smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/hmm_adata.py
CHANGED
@@ -1,318 +1,999 @@
-
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+
+from smftools.logging_utils import get_logger
+
+# FIX: import _to_dense_np to avoid NameError
+from ..hmm.HMM import _safe_int_coords, _to_dense_np, create_hmm, normalize_hmm_feature_sets
+
+logger = get_logger(__name__)
+
+# =============================================================================
+# Helpers: extracting training arrays
+# =============================================================================
+
+
+def _get_training_matrix(
+    subset, cols_mask: np.ndarray, smf_modality: Optional[str], cfg
+) -> Tuple[np.ndarray, Optional[str]]:
+    """
+    Matches your existing behavior:
+      - direct -> uses cfg.output_binary_layer_name in .layers
+      - else -> uses .X
+    Returns (X, layer_name_or_None) where X is dense float array.
     """
-
-    Command line accesses this through smftools hmm <config_path>
+    sub = subset[:, cols_mask]

-
-
+    if smf_modality == "direct":
+        hmm_layer = getattr(cfg, "output_binary_layer_name", None)
+        if hmm_layer is None or hmm_layer not in sub.layers:
+            raise KeyError(f"Missing HMM training layer '{hmm_layer}' in subset.")

+        logger.debug("Using direct modality HMM training layer: %s", hmm_layer)
+        mat = sub.layers[hmm_layer]
+    else:
+        logger.debug("Using .X for HMM training matrix")
+        hmm_layer = None
+        mat = sub.X
+
+    X = _to_dense_np(mat).astype(float)
+    if X.ndim != 2:
+        raise ValueError(f"Expected 2D training matrix; got {X.shape}")
+    return X, hmm_layer
+
+
+def _resolve_pos_mask_for_methbase(subset, ref: str, methbase: str) -> Optional[np.ndarray]:
+    """
+    Reproduces your mask resolution, with compatibility for both *_any_C_site and *_C_site.
+    """
+    key = str(methbase).strip().lower()
+
+    logger.debug("Resolving position mask for methbase=%s on ref=%s", key, ref)
+
+    if key in ("a",):
+        col = f"{ref}_A_site"
+        if col not in subset.var:
+            return None
+        logger.debug("Using positions with A calls from column: %s", col)
+        return np.asarray(subset.var[col])
+
+    if key in ("c", "any_c", "anyc", "any-c"):
+        for col in (f"{ref}_any_C_site", f"{ref}_C_site"):
+            if col in subset.var:
+                logger.debug("Using positions with C calls from column: %s", col)
+                return np.asarray(subset.var[col])
+        return None
+
+    if key in ("gpc", "gpc_site", "gpc-site"):
+        col = f"{ref}_GpC_site"
+        if col not in subset.var:
+            return None
+        logger.debug("Using positions with GpC calls from column: %s", col)
+        return np.asarray(subset.var[col])
+
+    if key in ("cpg", "cpg_site", "cpg-site"):
+        col = f"{ref}_CpG_site"
+        if col not in subset.var:
+            return None
+        logger.debug("Using positions with CpG calls from column: %s", col)
+        return np.asarray(subset.var[col])
+
+    alt = f"{ref}_{methbase}_site"
+    if alt not in subset.var:
+        return None
+
+    logger.debug("Using positions from column: %s", alt)
+    return np.asarray(subset.var[alt])
+
+
+def build_single_channel(
+    subset, ref: str, methbase: str, smf_modality: Optional[str], cfg
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
     Returns:
-
+        X (N, Lmb) float with NaNs allowed
+        coords (Lmb,) int coords from var_names
+    """
+    pm = _resolve_pos_mask_for_methbase(subset, ref, methbase)
+    logger.debug(
+        "Position mask for methbase=%s on ref=%s has %d sites",
+        methbase,
+        ref,
+        int(np.sum(pm)) if pm is not None else 0,
+    )
+
+    if pm is None or int(np.sum(pm)) == 0:
+        raise ValueError(f"No columns for methbase={methbase} on ref={ref}")
+
+    X, _ = _get_training_matrix(subset, pm, smf_modality, cfg)
+    logger.debug("Training matrix for methbase=%s on ref=%s has shape %s", methbase, ref, X.shape)
+
+    coords, _ = _safe_int_coords(subset[:, pm].var_names)
+    logger.debug(
+        "Coordinates for methbase=%s on ref=%s have length %d", methbase, ref, coords.shape[0]
+    )
+
+    return X, coords
+
+
+def build_multi_channel_union(
+    subset, ref: str, methbases: Sequence[str], smf_modality: Optional[str], cfg
+) -> Tuple[np.ndarray, np.ndarray, List[str]]:
+    """
+    Build (N, Lunion, C) on union coordinate grid across methbases.
+
+    Returns:
+        X3d: (N, Lunion, C) float with NaN where methbase has no site
+        coords: (Lunion,) int union coords
+        used_methbases: list of methbases actually included (>=2)
+    """
+    per: List[Tuple[str, np.ndarray, np.ndarray, np.ndarray]] = []  # (mb, X, coords, pm)
+
+    for mb in methbases:
+        pm = _resolve_pos_mask_for_methbase(subset, ref, mb)
+        if pm is None or int(np.sum(pm)) == 0:
+            continue
+        Xmb, _ = _get_training_matrix(subset, pm, smf_modality, cfg)  # (N,Lmb)
+        cmb, _ = _safe_int_coords(subset[:, pm].var_names)
+        per.append((mb, Xmb.astype(float), cmb.astype(int), pm))
+
+    if len(per) < 2:
+        raise ValueError(f"Need >=2 methbases with columns for union multi-channel on ref={ref}")
+
+    # union coordinates
+    coords = np.unique(np.concatenate([c for _, _, c, _ in per], axis=0)).astype(int)
+    idx = {int(v): i for i, v in enumerate(coords.tolist())}
+
+    N = per[0][1].shape[0]
+    L = coords.shape[0]
+    C = len(per)
+    X3 = np.full((N, L, C), np.nan, dtype=float)
+
+    for ci, (mb, Xmb, cmb, _) in enumerate(per):
+        cols = np.array([idx[int(v)] for v in cmb.tolist()], dtype=int)
+        X3[:, cols, ci] = Xmb
+
+    used = [mb for (mb, _, _, _) in per]
+    return X3, coords, used
+
+
+@dataclass
+class HMMTask:
+    name: str
+    signals: List[str]  # e.g. ["GpC"] or ["GpC","CpG"] or ["CpG"]
+    feature_groups: List[str]  # e.g. ["footprint","accessible"] or ["cpg"]
+    output_prefix: Optional[str] = None  # force prefix (CpG task uses "CpG")
+
+
+def build_hmm_tasks(cfg: Union[dict, Any]) -> List[HMMTask]:
+    """
+    Accessibility signals come from cfg['hmm_methbases'].
+    CpG task is enabled by cfg['cpg']==True, independent of hmm_methbases.
+    """
+    if not isinstance(cfg, dict):
+        # best effort conversion
+        cfg = {k: getattr(cfg, k) for k in dir(cfg) if not k.startswith("_")}
+
+    tasks: List[HMMTask] = []
+
+    # accessibility task
+    methbases = list(cfg.get("hmm_methbases", []) or [])
+    if len(methbases) > 0:
+        tasks.append(
+            HMMTask(
+                name="accessibility",
+                signals=methbases,
+                feature_groups=["footprint", "accessible"],
+                output_prefix=None,
+            )
+        )
+
+    # CpG task (special case)
+    if bool(cfg.get("cpg", False)):
+        tasks.append(
+            HMMTask(
+                name="cpg",
+                signals=["CpG"],
+                feature_groups=["cpg"],
+                output_prefix="CpG",
+            )
+        )
+
+    return tasks
+
+
+def select_hmm_arch(cfg: dict, signals: Sequence[str]) -> str:
+    """
+    Simple, explicit model selection:
+      - distance-aware => 'single_distance_binned' (only meaningful for single-channel)
+      - multi-signal => 'multi'
+      - else => 'single'
+    """
+    if bool(cfg.get("hmm_distance_aware", False)) and len(signals) == 1:
+        return "single_distance_binned"
+    if len(signals) > 1:
+        return "multi"
+    return "single"
+
+
+def resolve_input_layer(adata, cfg: dict, layer_override: Optional[str]) -> Optional[str]:
+    """
+    If direct modality, prefer cfg.output_binary_layer_name.
+    Else use layer_override or None (meaning use .X).
+    """
+    smf_modality = cfg.get("smf_modality", None)
+    if smf_modality == "direct":
+        nm = cfg.get("output_binary_layer_name", None)
+        if nm is None:
+            raise KeyError("cfg.output_binary_layer_name missing for smf_modality='direct'")
+        if nm not in adata.layers:
+            raise KeyError(f"Direct modality expects layer '{nm}' in adata.layers")
+        return nm
+    return layer_override
+
+
+def _ensure_layer_and_assign_rows(adata, layer_name: str, row_mask: np.ndarray, subset_layer):
     """
-
+    Writes subset_layer (n_subset_obs, n_vars) into adata.layers[layer_name] for rows where row_mask==True.
+    """
+    row_mask = np.asarray(row_mask, dtype=bool)
+    if row_mask.ndim != 1 or row_mask.size != adata.n_obs:
+        raise ValueError("row_mask must be length adata.n_obs")
+
+    arr = _to_dense_np(subset_layer)
+    if arr.shape != (int(row_mask.sum()), adata.n_vars):
+        raise ValueError(
+            f"subset layer '{layer_name}' shape {arr.shape} != ({int(row_mask.sum())}, {adata.n_vars})"
+        )
+
+    if layer_name not in adata.layers:
+        adata.layers[layer_name] = np.zeros((adata.n_obs, adata.n_vars), dtype=arr.dtype)
+
+    adata.layers[layer_name][row_mask, :] = arr
+
+
+def resolve_torch_device(device_str: str | None) -> torch.device:
+    d = (device_str or "auto").lower()
+    if d == "auto":
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return torch.device("mps")
+        return torch.device("cpu")
+    return torch.device(d)
+
+
+# =============================================================================
+# Model selection + fit strategy manager
+# =============================================================================
+@dataclass
+class HMMTrainer:
+    cfg: Any
+    models_dir: Path
+
+    def __post_init__(self):
+        self.models_dir = Path(self.models_dir)
+        self.models_dir.mkdir(parents=True, exist_ok=True)
+
+    def choose_arch(self, *, multichannel: bool) -> str:
+        use_dist = bool(getattr(self.cfg, "hmm_distance_aware", False))
+        if multichannel:
+            return "multi"
+        return "single_distance_binned" if use_dist else "single"
+
+    def _fit_scope(self) -> str:
+        return str(getattr(self.cfg, "hmm_fit_scope", "per_sample")).lower()
+        # "per_sample" | "global" | "global_then_adapt"
+
+    def _path(self, kind: str, sample: str, ref: str, label: str) -> Path:
+        # kind: "GLOBAL" | "PER" | "ADAPT"
+        def safe(s):
+            str(s).replace("/", "_")
+
+        return self.models_dir / f"{kind}_{safe(sample)}_{safe(ref)}_{safe(label)}.pt"
+
+    def _save(self, model, path: Path):
+        override = {}
+        if getattr(model, "hmm_name", None) == "multi":
+            override["hmm_n_channels"] = int(getattr(model, "n_channels", 2))
+        if getattr(model, "hmm_name", None) == "single_distance_binned":
+            override["hmm_distance_bins"] = list(
+                getattr(model, "distance_bins", [1, 5, 10, 25, 50, 100])
+            )
+
+        payload = {
+            "state_dict": model.state_dict(),
+            "hmm_arch": getattr(model, "hmm_name", None) or getattr(self.cfg, "hmm_arch", None),
+            "override": override,
+        }
+        torch.save(payload, path)
+
+    def _load(self, path: Path, arch: str, device):
+        payload = torch.load(path, map_location="cpu")
+        override = payload.get("override", None)
+        m = create_hmm(self.cfg, arch=arch, override=override, device=device)
+        sd = payload["state_dict"]
+
+        target_dtype = next(m.parameters()).dtype
+        for k, v in list(sd.items()):
+            if isinstance(v, torch.Tensor) and v.dtype != target_dtype:
+                sd[k] = v.to(dtype=target_dtype)
+
+        m.load_state_dict(sd)
+        m.to(device)
+        m.eval()
+        return m
+
+    def fit_or_load(
+        self,
+        *,
+        sample: str,
+        ref: str,
+        label: str,
+        arch: str,
+        X,
+        coords: Optional[np.ndarray],
+        device,
+    ):
+        force_fit = bool(getattr(self.cfg, "force_redo_hmm_fit", False))
+        scope = self._fit_scope()
+
+        max_iter = int(getattr(self.cfg, "hmm_max_iter", 50))
+        tol = float(getattr(self.cfg, "hmm_tol", 1e-4))
+        verbose = bool(getattr(self.cfg, "hmm_verbose", False))
+
+        # ---- global then adapt ----
+        if scope == "global_then_adapt":
+            p_global = self._path("GLOBAL", "ALL", ref, label)
+            if p_global.exists() and not force_fit:
+                base = self._load(p_global, arch=arch, device=device)
+            else:
+                base = create_hmm(self.cfg, arch=arch).to(device)
+                if arch == "single_distance_binned":
+                    base.fit(
+                        X, device=device, coords=coords, max_iter=max_iter, tol=tol, verbose=verbose
+                    )
+                else:
+                    base.fit(X, device=device, max_iter=max_iter, tol=tol, verbose=verbose)
+                self._save(base, p_global)
+
+            p_adapt = self._path("ADAPT", sample, ref, label)
+            if p_adapt.exists() and not force_fit:
+                return self._load(p_adapt, arch=arch, device=device)
+
+            # IMPORTANT: this assumes you added model.adapt_emissions(...)
+            adapted = copy.deepcopy(base).to(device)
+            if arch == "single_distance_binned":
+                adapted.adapt_emissions(
+                    X,
+                    coords,
+                    device=device,
+                    max_iter=int(getattr(self.cfg, "hmm_adapt_iters", 10)),
+                    verbose=verbose,
+                )
+
+            else:
+                adapted.adapt_emissions(
+                    X,
+                    coords,
+                    device=device,
+                    max_iter=int(getattr(self.cfg, "hmm_adapt_iters", 10)),
+                    verbose=verbose,
+                )
+
+            self._save(adapted, p_adapt)
+            return adapted
+
+        # ---- global only ----
+        if scope == "global":
+            p = self._path("GLOBAL", "ALL", ref, label)
+            if p.exists() and not force_fit:
+                return self._load(p, arch=arch, device=device)
+
+        # ---- per sample ----
+        else:
+            p = self._path("PER", sample, ref, label)
+            if p.exists() and not force_fit:
+                return self._load(p, arch=arch, device=device)
+
+        m = create_hmm(self.cfg, arch=arch, device=device)
+        if arch == "single_distance_binned":
+            m.fit(X, coords, device=device, max_iter=max_iter, tol=tol, verbose=verbose)
+        else:
+            m.fit(X, coords, device=device, max_iter=max_iter, tol=tol, verbose=verbose)
+        self._save(m, p)
+        return m
+
+
+def _fully_qualified_merge_layers(cfg, prefix: str) -> List[Tuple[str, int]]:
+    """
+    cfg.hmm_merge_layer_features is assumed to be a list of (core_layer_name, merge_distance),
+    where core_layer_name is like "all_accessible_features" (NOT prefixed with methbase).
+    We expand to f"{prefix}_{core_layer_name}".
+    """
+    out = []
+    for core_layer, dist in getattr(cfg, "hmm_merge_layer_features", []) or []:
+        if not core_layer:
+            continue
+        out.append((f"{prefix}_{core_layer}", int(dist)))
+    return out
+
+
+def hmm_adata(config_path: str):
+    """
+    CLI-facing wrapper for HMM analysis.
+
+    Command line entrypoint:
+        smftools hmm <config_path>
+
+    Responsibilities:
+      - Build cfg via load_adata()
+      - Ensure preprocess + spatial stages are run
+      - Decide which AnnData to start from (hmm > spatial > pp_dedup > pp > raw)
+      - Call hmm_adata_core(cfg, adata, paths)
+    """
+    from ..readwrite import safe_read_h5ad
+    from .helpers import get_adata_paths
     from .load_adata import load_adata
     from .preprocess_adata import preprocess_adata
     from .spatial_adata import spatial_adata

-
-
-
-    import scanpy as sc
+    # 1) load cfg / stage paths
+    _, _, cfg = load_adata(config_path)
+    paths = get_adata_paths(cfg)

-
-
-
+    # 2) make sure upstream stages are run (they have their own skipping logic)
+    preprocess_adata(config_path)
+    spatial_ad, spatial_path = spatial_adata(config_path)

-
-
+    # 3) choose starting AnnData
+    # Prefer:
+    #   - existing HMM h5ad if not forcing redo
+    #   - in-memory spatial_ad from wrapper call
+    #   - saved spatial / pp_dedup / pp / raw on disk
+    if paths.hmm.exists() and not (cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply):
+        adata, _ = safe_read_h5ad(paths.hmm)
+        return adata, paths.hmm

-
-    adata, adata_path, cfg = load_adata(config_path)
-    # General config variable init - Necessary user passed inputs
-    smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
-    output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
-
-    # Make initial output directory
-    make_dirs([output_directory])
-    ############################################### smftools load end ###############################################
-
-    ############################################### smftools preprocess start ###############################################
-    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
-    ############################################### smftools preprocess end ###############################################
-
-    ############################################### smftools spatial start ###############################################
-    spatial_ad, spatial_adata_path = spatial_adata(config_path)
-    ############################################### smftools spatial end ###############################################
-
-    ############################################### smftools hmm start ###############################################
-    input_manager_df = pd.read_csv(cfg.summary_file)
-    initial_adata_path = Path(input_manager_df['load_adata'][0])
-    pp_adata_path = Path(input_manager_df['pp_adata'][0])
-    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
-    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
-    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
-
-    if spatial_ad:
-        # This happens on first run of the pipeline
+    if spatial_ad is not None:
         adata = spatial_ad
+        source_path = spatial_path
+    elif paths.spatial.exists():
+        adata, _ = safe_read_h5ad(paths.spatial)
+        source_path = paths.spatial
+    elif paths.pp_dedup.exists():
+        adata, _ = safe_read_h5ad(paths.pp_dedup)
+        source_path = paths.pp_dedup
+    elif paths.pp.exists():
+        adata, _ = safe_read_h5ad(paths.pp)
+        source_path = paths.pp
+    elif paths.raw.exists():
+        adata, _ = safe_read_h5ad(paths.raw)
+        source_path = paths.raw
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        raise FileNotFoundError(
+            "No AnnData available for HMM: expected at least raw or preprocessed h5ad."
+        )
+
+    # 4) delegate to core
+    adata, hmm_adata_path = hmm_adata_core(
+        cfg,
+        adata,
+        paths,
+        source_adata_path=source_path,
+        config_path=config_path,
+    )
+    return adata, hmm_adata_path
+
+
+def hmm_adata_core(
+    cfg,
+    adata,
+    paths,
+    source_adata_path: Path | None = None,
+    config_path: str | None = None,
+) -> Tuple["anndata.AnnData", Path]:
+    """
+    Core HMM analysis pipeline.
+
+    Assumes:
+      - cfg is an ExperimentConfig
+      - adata is the starting AnnData (typically spatial + dedup)
+      - paths is an AdataPaths object (with .raw/.pp/.pp_dedup/.spatial/.hmm)
+
+    Does NOT decide which h5ad to start from – that is the wrapper's job.
+    """
+
+    import numpy as np
+
+    from ..hmm import call_hmm_peaks
+    from ..metadata import record_smftools_metadata
+    from ..plotting import (
+        combined_hmm_raw_clustermap,
+        plot_hmm_layers_rolling_by_sample_ref,
+        plot_hmm_size_contours,
+    )
+    from ..readwrite import make_dirs
+    from .helpers import write_gz_h5ad
+
+    smf_modality = cfg.smf_modality
+    deaminase = smf_modality == "deaminase"
+
+    output_directory = Path(cfg.output_directory)
+    make_dirs([output_directory])
+
+    pp_dir = output_directory / "preprocessed" / "deduplicated"
+
+    # ---------------------------- HMM annotate stage ----------------------------
     if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
-
-
-        import warnings
+        hmm_models_dir = pp_dir / "10_hmm_models"
+        make_dirs([pp_dir, hmm_models_dir])

-
-
-
+        # Standard bookkeeping
+        uns_key = "hmm_appended_layers"
+        if adata.uns.get(uns_key) is None:
+            adata.uns[uns_key] = []
+        global_appended = list(adata.uns.get(uns_key, []))

-
-
-
-
+        # Prepare trainer + feature config
+        trainer = HMMTrainer(cfg=cfg, models_dir=hmm_models_dir)
+
+        feature_sets = normalize_hmm_feature_sets(getattr(cfg, "hmm_feature_sets", None))
+        prob_thr = float(getattr(cfg, "hmm_feature_prob_threshold", 0.5))
+        decode = str(getattr(cfg, "hmm_decode", "marginal"))
+        write_post = bool(getattr(cfg, "hmm_write_posterior", True))
+        post_state = getattr(cfg, "hmm_posterior_state", "Modified")
+        merged_suffix = str(getattr(cfg, "hmm_merged_suffix", "_merged"))
+        force_apply = bool(getattr(cfg, "force_redo_hmm_apply", False))
+        bypass_apply = bool(getattr(cfg, "bypass_hmm_apply", False))
+        bypass_fit = bool(getattr(cfg, "bypass_hmm_fit", False))

         samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
         references = adata.obs[cfg.reference_column].cat.categories
-
+        methbases = list(getattr(cfg, "hmm_methbases", [])) or []

-
-
-        adata.uns[uns_key] = []
+        if not methbases:
+            raise ValueError("cfg.hmm_methbases is empty.")

-
-
-
-
-
-
-
-
-
-
-
-
-
-                    if obsm_key not in subset.obsm:
-                        print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
+        # Top-level skip
+        already = bool(adata.uns.get("hmm_annotated", False))
+        if already and not (bool(getattr(cfg, "force_redo_hmm_fit", False)) or force_apply):
+            pass
+
+        else:
+            logger.info("Starting HMM annotation over samples and references")
+            for sample in samples:
+                for ref in references:
+                    mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (
+                        adata.obs[cfg.reference_column] == ref
+                    )
+                    if int(np.sum(mask)) == 0:
                         continue

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    subset = adata[mask].copy()
+                    subset.uns[uns_key] = []  # isolate appended tracking per subset
+
+                    # ---- Decide which tasks to run ----
+                    methbases = list(getattr(cfg, "hmm_methbases", [])) or []
+                    run_multi = bool(getattr(cfg, "hmm_run_multichannel", True))
+                    run_cpg = bool(getattr(cfg, "cpg", False))
+                    device = resolve_torch_device(cfg.device)
+
+                    logger.info("HMM processing sample=%s ref=%s", sample, ref)
+
+                    # ---- split feature sets ----
+                    feature_sets_all = normalize_hmm_feature_sets(
+                        getattr(cfg, "hmm_feature_sets", None)
+                    )
+                    feature_sets_access = {
+                        k: v
+                        for k, v in feature_sets_all.items()
+                        if k in ("footprint", "accessible")
+                    }
+                    feature_sets_cpg = (
+                        {"cpg": feature_sets_all["cpg"]} if "cpg" in feature_sets_all else {}
+                    )
+
+                    # =========================
+                    # 1) Single-channel accessibility (per methbase)
+                    # =========================
+                    for mb in methbases:
+                        logger.info("HMM single-channel for methbase=%s", mb)
+
+                        try:
+                            X, coords = build_single_channel(
+                                subset,
+                                ref=str(ref),
+                                methbase=str(mb),
+                                smf_modality=smf_modality,
+                                cfg=cfg,
+                            )
+                        except Exception:
+                            logger.warning(
+                                "Skipping HMM single-channel for methbase=%s due to data error", mb
+                            )
                             continue

-
-
-                        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        arch = trainer.choose_arch(multichannel=False)
+
+                        logger.info("HMM fitting/loading for methbase=%s", mb)
+                        hmm = trainer.fit_or_load(
+                            sample=str(sample),
+                            ref=str(ref),
+                            label=str(mb),
+                            arch=arch,
+                            X=X,
+                            coords=coords,
+                            device=device,
+                        )
+
+                        if not bypass_apply:
+                            logger.info("HMM applying for methbase=%s", mb)
+                            pm = _resolve_pos_mask_for_methbase(subset, str(ref), str(mb))
+                            hmm.annotate_adata(
+                                subset,
+                                prefix=str(mb),
+                                X=X,
+                                coords=coords,
+                                var_mask=pm,
+                                span_fill=True,
+                                config=cfg,
+                                decode=decode,
+                                write_posterior=write_post,
+                                posterior_state=post_state,
+                                feature_sets=feature_sets_access,  # <--- ONLY accessibility feature sets
+                                prob_threshold=prob_thr,
+                                uns_key=uns_key,
+                                uns_flag=f"hmm_annotated_{mb}",
+                                force_redo=force_apply,
+                            )
+
+                        # merges for this mb
+                        for core_layer, dist in (
+                            getattr(cfg, "hmm_merge_layer_features", []) or []
+                        ):
+                            base_layer = f"{mb}_{core_layer}"
+                            logger.info("Merging intervals for layer=%s", base_layer)
+                            if base_layer in subset.layers:
+                                merged_base = hmm.merge_intervals_to_new_layer(
+                                    subset,
+                                    base_layer,
+                                    distance_threshold=int(dist),
+                                    suffix=merged_suffix,
+                                    overwrite=True,
+                                )
+                                # write merged size classes based on whichever group core_layer corresponds to
+                                for group, fs in feature_sets_access.items():
+                                    fmap = fs.get("features", {}) or {}
+                                    if fmap:
+                                        hmm.write_size_class_layers_from_binary(
+                                            subset,
+                                            merged_base,
+                                            out_prefix=str(mb),
+                                            feature_ranges=fmap,
+                                            suffix=merged_suffix,
+                                            overwrite=True,
+                                        )
+
+                    # =========================
+                    # 2) Multi-channel accessibility (Combined)
+                    # =========================
+                    if run_multi and len(methbases) >= 2:
+                        logger.info("HMM multi-channel for methbases=%s", ",".join(methbases))
+                        try:
+                            X3, coords_u, used_mbs = build_multi_channel_union(
+                                subset,
+                                ref=str(ref),
+                                methbases=methbases,
+                                smf_modality=smf_modality,
+                                cfg=cfg,
+                            )
+                        except Exception:
+                            X3, coords_u, used_mbs = None, None, []
+                            logger.warning(
+                                "Skipping HMM multi-channel due to data error or insufficient methbases"
+                            )
+
+                        if X3 is not None and len(used_mbs) >= 2:
+                            union_mask = None
+                            for mb in used_mbs:
+                                pm = _resolve_pos_mask_for_methbase(subset, str(ref), str(mb))
+                                union_mask = pm if union_mask is None else (union_mask | pm)
+
+                            arch = trainer.choose_arch(multichannel=True)
+
+                            logger.info("HMM fitting/loading for multi-channel")
+                            hmmc = trainer.fit_or_load(
+                                sample=str(sample),
+                                ref=str(ref),
+                                label="Combined",
+                                arch=arch,
+                                X=X3,
+                                coords=coords_u,
+                                device=device,
+                            )
+
+                            if not bypass_apply:
+                                logger.info("HMM applying for multi-channel")
+                                hmmc.annotate_adata(
+                                    subset,
+                                    prefix="Combined",
+                                    X=X3,
+                                    coords=coords_u,
+                                    var_mask=union_mask,
+                                    span_fill=True,
+                                    config=cfg,
+                                    decode=decode,
+                                    write_posterior=write_post,
+                                    posterior_state=post_state,
+                                    feature_sets=feature_sets_access,  # <--- accessibility only
+                                    prob_threshold=prob_thr,
+                                    uns_key=uns_key,
+                                    uns_flag="hmm_annotated_combined",
+                                    force_redo=force_apply,
+                                )
+
+                            for core_layer, dist in (
+                                getattr(cfg, "hmm_merge_layer_features", []) or []
+                            ):
+                                base_layer = f"Combined_{core_layer}"
+                                if base_layer in subset.layers:
+                                    merged_base = hmmc.merge_intervals_to_new_layer(
+                                        subset,
+                                        base_layer,
+                                        distance_threshold=int(dist),
+                                        suffix=merged_suffix,
+                                        overwrite=True,
+                                    )
+                                    for group, fs in feature_sets_access.items():
+                                        fmap = fs.get("features", {}) or {}
+                                        if fmap:
+                                            hmmc.write_size_class_layers_from_binary(
+                                                subset,
+                                                merged_base,
+                                                out_prefix="Combined",
+                                                feature_ranges=fmap,
+                                                suffix=merged_suffix,
+                                                overwrite=True,
+                                            )
+
+                    # =========================
+                    # 3) CpG-only single-channel task
+                    # =========================
+                    if run_cpg:
+                        logger.info("HMM single-channel for CpG")
+                        try:
+                            Xcpg, coordscpg = build_single_channel(
+                                subset,
+                                ref=str(ref),
+                                methbase="CpG",
+                                smf_modality=smf_modality,
+                                cfg=cfg,
+                            )
+                        except Exception:
+                            Xcpg, coordscpg = None, None
+                            logger.warning("Skipping HMM single-channel for CpG due to data error")
+
+                        if Xcpg is not None and Xcpg.size and feature_sets_cpg:
+                            arch = trainer.choose_arch(multichannel=False)
+
+                            logger.info("HMM fitting/loading for CpG")
+                            hmmg = trainer.fit_or_load(
+                                sample=str(sample),
+                                ref=str(ref),
+                                label="CpG",
+                                arch=arch,
+                                X=Xcpg,
+                                coords=coordscpg,
+                                device=device,
+                            )
+
+                            if not bypass_apply:
+                                logger.info("HMM applying for CpG")
+                                pm = _resolve_pos_mask_for_methbase(subset, str(ref), "CpG")
+                                hmmg.annotate_adata(
+                                    subset,
+                                    prefix="CpG",
+                                    X=Xcpg,
+                                    coords=coordscpg,
+                                    var_mask=pm,
+                                    span_fill=True,
+                                    config=cfg,
+                                    decode=decode,
+                                    write_posterior=write_post,
+                                    posterior_state=post_state,
+                                    feature_sets=feature_sets_cpg,  # <--- ONLY cpg group (cpg_patch)
+                                    prob_threshold=prob_thr,
+                                    uns_key=uns_key,
+                                    uns_flag="hmm_annotated_CpG",
+                                    force_redo=force_apply,
+                                )
+
+                    # ------------------------------------------------------------
+                    # Copy newly created subset layers back into the full adata
+                    # ------------------------------------------------------------
+                    appended = (
+                        list(subset.uns.get(uns_key, []))
+                        if subset.uns.get(uns_key) is not None
+                        else []
+                    )
+                    if appended:
+                        row_mask = np.asarray(
+                            mask.values if hasattr(mask, "values") else mask, dtype=bool
+                        )

-
-
-
-
-
-
-        if ".gz" == hmm_adata_path.suffix:
-            safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
-        else:
-            hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
-            safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+                        for ln in appended:
+                            if ln not in subset.layers:
+                                continue
+                            _ensure_layer_and_assign_rows(adata, ln, row_mask, subset.layers[ln])
+                            if ln not in global_appended:
+                                global_appended.append(ln)

-
+            adata.uns[uns_key] = global_appended

-
+        adata.uns["hmm_annotated"] = True

-
-
-
+        hmm_layers = list(adata.uns.get("hmm_appended_layers", []) or [])
+        # keep only real feature layers; drop lengths/states/posterior
+        hmm_layers = [
+            layer
+            for layer in hmm_layers
+            if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
+        ]
+        logger.info(f"HMM appended layers: {hmm_layers}")

+        # ---------------------------- HMM peak calling stage ----------------------------
+        hmm_dir = pp_dir / "11_hmm_peak_calling"
         if hmm_dir.is_dir():
-
+            pass
         else:
             make_dirs([pp_dir, hmm_dir])
-        from ..plotting import combined_hmm_raw_clustermap
-        feature_layers = [
-            "all_accessible_features",
-            "large_accessible_patch",
-            "small_bound_stretch",
-            "medium_bound_stretch",
-            "putative_nucleosome",
-            "all_accessible_features_merged",
-        ]

-
+        call_hmm_peaks(
+            adata,
+            feature_configs=cfg.hmm_peak_feature_configs,
+            ref_column=cfg.reference_column,
+            site_types=cfg.mod_target_bases,
+            save_plot=True,
+            output_dir=hmm_dir,
+            index_col_suffix=cfg.reindexed_var_suffix,
+        )

-
-
-
-
-
+        ## Save HMM annotated adata
+        if not paths.hmm.exists():
+            logger.info("Saving hmm analyzed AnnData (post preprocessing and duplicate removal).")
+            record_smftools_metadata(
+                adata,
+                step_name="hmm",
+                cfg=cfg,
+                config_path=config_path,
+                input_paths=[source_adata_path] if source_adata_path else None,
+                output_path=paths.hmm,
+            )
+            write_gz_h5ad(adata, paths.hmm)

-
-        layers.extend([f"A_{layer}" for layer in feature_layers])
+    ########################################################################################################################

-
-        raise ValueError(
-            f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
-            f"and smf_modality={smf_modality}"
-        )
-
-        if smf_modality == 'direct':
-            sort_by = "any_a"
-        else:
-            sort_by = 'gpc'
+    ############################################### HMM based feature plotting ###############################################

-
-
-
+    hmm_dir = pp_dir / "12_hmm_clustermaps"
+    make_dirs([pp_dir, hmm_dir])
+
+    layers: list[str] = []
+
+    for base in cfg.hmm_methbases:
+        layers.extend([f"{base}_{layer}" for layer in cfg.hmm_clustermap_feature_layers])
+
+    if getattr(cfg, "hmm_run_multichannel", True) and len(cfg.hmm_methbases) >= 2:
+        layers.extend([f"Combined_{layer}" for layer in cfg.hmm_clustermap_feature_layers])
+
+    if cfg.cpg:
+        layers.extend(["CpG_cpg_patch"])
+
+    if not layers:
+        raise ValueError(
+            f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
+            f"and smf_modality={smf_modality}"
+        )
+
+    for layer in layers:
+        hmm_cluster_save_dir = hmm_dir / layer
+        if hmm_cluster_save_dir.is_dir():
+            pass
+        else:
+            make_dirs([hmm_cluster_save_dir])

         combined_hmm_raw_clustermap(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            adata,
+            sample_col=cfg.sample_name_col_for_plotting,
+            reference_col=cfg.reference_column,
+            hmm_feature_layer=layer,
+            layer_gpc=cfg.layer_for_clustermap_plotting,
+            layer_cpg=cfg.layer_for_clustermap_plotting,
+            layer_c=cfg.layer_for_clustermap_plotting,
+            layer_a=cfg.layer_for_clustermap_plotting,
+            cmap_hmm=cfg.clustermap_cmap_hmm,
+            cmap_gpc=cfg.clustermap_cmap_gpc,
+            cmap_cpg=cfg.clustermap_cmap_cpg,
+            cmap_c=cfg.clustermap_cmap_c,
+            cmap_a=cfg.clustermap_cmap_a,
+            min_quality=cfg.read_quality_filter_thresholds[0],
+            min_length=cfg.read_len_filter_thresholds[0],
+            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
+                0
+            ],
+            min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
+            demux_types=("double", "already"),
+            save_path=hmm_cluster_save_dir,
+            normalize_hmm=False,
+            sort_by=cfg.hmm_clustermap_sortby,  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
+            bins=None,
+            deaminase=deaminase,
+            min_signal=0,
+            index_col_suffix=cfg.reindexed_var_suffix,
         )

-    hmm_dir = pp_dir / "
+    hmm_dir = pp_dir / "13_hmm_bulk_traces"

     if hmm_dir.is_dir():
-
+        logger.debug(f"{hmm_dir} already exists.")
     else:
         make_dirs([pp_dir, hmm_dir])
     from ..plotting import plot_hmm_layers_rolling_by_sample_ref
+
+    bulk_hmm_layers = [
+        layer
+        for layer in hmm_layers
+        if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
+    ]
     saved = plot_hmm_layers_rolling_by_sample_ref(
         adata,
-        layers=
+        layers=bulk_hmm_layers,
         sample_col=cfg.sample_name_col_for_plotting,
         ref_col=cfg.reference_column,
         window=101,
         rows_per_page=4,
-        figsize_per_cell=(4,2.5),
+        figsize_per_cell=(4, 2.5),
         output_dir=hmm_dir,
         save=True,
-        show_raw=False
+        show_raw=False,
     )

-    hmm_dir = pp_dir / "
+    hmm_dir = pp_dir / "14_hmm_fragment_distributions"

     if hmm_dir.is_dir():
-
+        logger.debug(f"{hmm_dir} already exists.")
     else:
         make_dirs([pp_dir, hmm_dir])
     from ..plotting import plot_hmm_size_contours

-
+    if smf_modality == "deaminase":
+        fragments = [
+            ("C_all_accessible_features_lengths", 400),
+            ("C_all_footprint_features_lengths", 250),
+            ("C_all_accessible_features_merged_lengths", 800),
+        ]
+    elif smf_modality == "conversion":
+        fragments = [
+            ("GpC_all_accessible_features_lengths", 400),
+            ("GpC_all_footprint_features_lengths", 250),
+            ("GpC_all_accessible_features_merged_lengths", 800),
+        ]
+    elif smf_modality == "direct":
+        fragments = [
+            ("A_all_accessible_features_lengths", 400),
+            ("A_all_footprint_features_lengths", 200),
+            ("A_all_accessible_features_merged_lengths", 800),
+        ]
+
+    for layer, max in fragments:
         save_path = hmm_dir / layer
         make_dirs([save_path])

@@ -328,11 +1009,11 @@ def hmm_adata(config_path):
             save_pdf=False,
             save_each_page=True,
             dpi=200,
-            smoothing_sigma=
-            normalize_after_smoothing=
-            cmap=
-            log_scale_z=True
+            smoothing_sigma=(10, 10),
+            normalize_after_smoothing=True,
+            cmap="Greens",
+            log_scale_z=True,
         )
         ########################################################################################################################

-    return (adata,
+    return (adata, paths.hmm)
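
The heart of the new multi-channel path is build_multi_channel_union, which places each methbase's (N, Lmb) matrix onto a shared union coordinate grid, leaving NaN where a channel has no site. Below is a minimal, self-contained numpy sketch of that same indexing logic; the toy arrays and values are hypothetical and not part of the package's API.

import numpy as np

# Two hypothetical methbase channels observed at different coordinate sets.
coords_gpc = np.array([10, 20, 40])   # GpC sites
coords_cpg = np.array([20, 30])       # CpG sites
X_gpc = np.array([[1.0, 0.0, 1.0]])   # one read, values at GpC sites
X_cpg = np.array([[0.0, 1.0]])        # same read, values at CpG sites

# Union grid, as in build_multi_channel_union: NaN marks "no site for this channel".
coords = np.unique(np.concatenate([coords_gpc, coords_cpg]))
idx = {int(v): i for i, v in enumerate(coords)}
X3 = np.full((1, coords.size, 2), np.nan)
X3[:, [idx[int(v)] for v in coords_gpc], 0] = X_gpc
X3[:, [idx[int(v)] for v in coords_cpg], 1] = X_cpg

print(coords)   # [10 20 30 40]
print(X3[0])    # rows: positions; columns: (GpC, CpG), NaN where the channel is absent

The NaN fill appears to be load-bearing: per the docstrings, both the single- and multi-channel training matrices allow NaN, presumably so the emission model can mask missing observations rather than read them as unmodified calls.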
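Similarly, HMMTrainer._save/_load round-trip model checkpoints as a plain dict payload: the state_dict plus enough metadata ("hmm_arch", "override") to rebuild a compatible model before loading weights, with a dtype-cast pass so a checkpoint saved at one precision loads at another. Here is a stand-alone sketch of the same pattern using a stand-in torch.nn module; the model, file name, and metadata values are illustrative only, not the smftools HMM classes.

import torch

# Hypothetical stand-in for a fitted HMM.
model = torch.nn.Linear(4, 2)

# Payload shape mirrors HMMTrainer._save: weights plus architecture metadata.
payload = {
    "state_dict": model.state_dict(),
    "hmm_arch": "single",
    "override": {},
}
torch.save(payload, "GLOBAL_ALL_ref1_GpC.pt")

# Loading mirrors HMMTrainer._load: rebuild the model, cast checkpoint tensors
# to the live parameter dtype, then load weights and switch to eval mode.
restored = torch.nn.Linear(4, 2)
loaded = torch.load("GLOBAL_ALL_ref1_GpC.pt", map_location="cpu")
sd = loaded["state_dict"]
target_dtype = next(restored.parameters()).dtype
for k, v in list(sd.items()):
    if isinstance(v, torch.Tensor) and v.dtype != target_dtype:
        sd[k] = v.to(dtype=target_dtype)
restored.load_state_dict(sd)
restored.eval()

Storing the architecture and overrides alongside the weights is what lets fit_or_load reuse a "GLOBAL" checkpoint across samples (or adapt it per sample) without re-deriving the model configuration from the current cfg.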
|