PyPI - smftools - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

smftools 0.2.1-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

smftools/plotting/general_plotting.py CHANGED Viewed

@@ -1,6 +1,93 @@
+from __future__ import annotations
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
+import scipy.cluster.hierarchy as sch
+import matplotlib.gridspec as gridspec
+import os
+import math
+import pandas as pd
+from typing import Optional, Mapping, Sequence, Any, Dict, List, Tuple
+from pathlib import Path
+def _fixed_tick_positions(n_positions: int, n_ticks: int) -> np.ndarray:
+    """
+    Choose about ``n_ticks`` evenly spaced integer indices in [0, n_positions-1].
+    The requested tick count is raised to at least 2. If there are no more
+    positions than requested ticks, every position index is returned; otherwise
+    the spaced indices start at 0 and end at n_positions-1.
+    """
+    tick_count = int(max(2, n_ticks))
+    if n_positions > tick_count:
+        # linspace yields a fixed number of candidates including both endpoints;
+        # np.unique returns them sorted and de-duplicated after rounding.
+        candidates = np.linspace(0, n_positions - 1, tick_count)
+        return np.unique(np.round(candidates).astype(int))
+    return np.arange(n_positions)
+def _select_labels(subset, sites: np.ndarray, reference: str, index_col_suffix: str | None):
+    """
+    Select tick labels for the heatmap axis.
+    Every return path yields a NumPy array of string labels, so callers can
+    treat the result uniformly (``.size``, fancy indexing, ``.tolist()``).
+    Parameters
+    ----------
+    subset : AnnData view
+        The per-bin subset of the AnnData.
+    sites : np.ndarray[int]
+        Indices of the subset.var positions to annotate.
+    reference : str
+        Reference name (e.g., '6B6_top').
+    index_col_suffix : None or str
+        If None → use subset.var_names
+        Else     → use subset.var[f"{reference}_{index_col_suffix}"]
+    Returns
+    -------
+    np.ndarray[str]
+        The labels to use for tick positions (empty string array if no sites).
+    Raises
+    ------
+    KeyError
+        If ``index_col_suffix`` is given but the derived column is not in
+        ``subset.var``.
+    """
+    if sites.size == 0:
+        # Typed empty result: a bare np.array([]) would be float64.
+        return np.array([], dtype=str)
+    # Default behavior: use var_names
+    if index_col_suffix is None:
+        # np.asarray converts the pandas Index to the documented ndarray type.
+        return np.asarray(subset.var_names[sites].astype(str))
+    # Otherwise: use a computed column adata.var[f"{reference}_{suffix}"]
+    colname = f"{reference}_{index_col_suffix}"
+    if colname not in subset.var:
+        raise KeyError(
+            f"index_col_suffix='{index_col_suffix}' requires var column '{colname}', "
+            f"but it is not present in adata.var."
+        )
+    # np.asarray (rather than .values) always yields a plain ndarray, even when
+    # pandas backs the string column with an extension array.
+    labels = np.asarray(subset.var[colname].astype(str))
+    return labels[sites]
+def normalized_mean(matrix: np.ndarray) -> np.ndarray:
+    """
+    Column-wise NaN-aware mean of ``matrix``, min-max scaled to roughly [0, 1].
+    Columns whose values are all NaN stay NaN in the result but no longer
+    poison the scaling of the other columns: the min and max are taken with
+    NaN-ignoring reductions. A small epsilon in the denominator avoids division
+    by zero when every column mean is identical.
+    """
+    import warnings
+    with warnings.catch_warnings():
+        # All-NaN columns are expected input here; silence NumPy's
+        # "Mean of empty slice" / "All-NaN slice" RuntimeWarnings.
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        mean = np.nanmean(matrix, axis=0)
+        lowest = np.nanmin(mean)
+        highest = np.nanmax(mean)
+    denom = (highest - lowest) + 1e-9
+    return (mean - lowest) / denom
+def methylation_fraction(matrix: np.ndarray) -> np.ndarray:
+    """
+    Per-column fraction of valid entries that are methylated.
+    An entry is methylated when it equals 1, and valid when it is finite and
+    non-zero. Columns with no valid entries yield 0.0.
+    """
+    values = np.asarray(matrix)
+    finite = np.isfinite(values)
+    n_methylated = (finite & (values == 1)).sum(axis=0)
+    n_valid = (finite & (values != 0)).sum(axis=0)
+    # Pre-filled with zeros so columns skipped by `where` report 0.0.
+    fraction = np.zeros_like(n_methylated, dtype=float)
+    np.divide(n_methylated, n_valid, out=fraction, where=n_valid != 0)
+    return fraction
 def clean_barplot(ax, mean_values, title):
     x = np.arange(len(mean_values))
@@ -17,438 +104,1072 @@ def clean_barplot(ax, mean_values, title):
     ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
+# def combined_hmm_raw_clustermap(
+#     adata,
+#     sample_col='Sample_Names',
+#     reference_col='Reference_strand',
+#     hmm_feature_layer="hmm_combined",
+#     layer_gpc="nan0_0minus1",
+#     layer_cpg="nan0_0minus1",
+#     layer_any_c="nan0_0minus1",
+#     cmap_hmm="tab10",
+#     cmap_gpc="coolwarm",
+#     cmap_cpg="viridis",
+#     cmap_any_c='coolwarm',
+#     min_quality=20,
+#     min_length=200,
+#     min_mapped_length_to_reference_length_ratio=0.8,
+#     min_position_valid_fraction=0.5,
+#     sample_mapping=None,
+#     save_path=None,
+#     normalize_hmm=False,
+#     sort_by="gpc",  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
+#     bins=None,
+#     deaminase=False,
+#     min_signal=0
+#     ):
+#     results = []
+#     if deaminase:
+#         signal_type = 'deamination'
+#     else:
+#         signal_type = 'methylation'
+#     for ref in adata.obs[reference_col].cat.categories:
+#         for sample in adata.obs[sample_col].cat.categories:
+#             try:
+#                 subset = adata[
+#                     (adata.obs[reference_col] == ref) &
+#                     (adata.obs[sample_col] == sample) &
+#                     (adata.obs['read_quality'] >= min_quality) &
+#                     (adata.obs['read_length'] >= min_length) &
+#                     (adata.obs['mapped_length_to_reference_length_ratio'] > min_mapped_length_to_reference_length_ratio)
+#                 ]
+#                 mask = subset.var[f"{ref}_valid_fraction"].astype(float) > float(min_position_valid_fraction)
+#                 subset = subset[:, mask]
+#                 if subset.shape[0] == 0:
+#                     print(f"  No reads left after filtering for {sample} - {ref}")
+#                     continue
+#                 if bins:
+#                     print(f"Using defined bins to subset clustermap for {sample} - {ref}")
+#                     bins_temp = bins
+#                 else:
+#                     print(f"Using all reads for clustermap for {sample} - {ref}")
+#                     bins_temp = {"All": (subset.obs['Reference_strand'] == ref)}
+#                 # Get column positions (not var_names!) of site masks
+#                 gpc_sites = np.where(subset.var[f"{ref}_GpC_site"].values)[0]
+#                 cpg_sites = np.where(subset.var[f"{ref}_CpG_site"].values)[0]
+#                 any_c_sites = np.where(subset.var[f"{ref}_any_C_site"].values)[0]
+#                 num_gpc = len(gpc_sites)
+#                 num_cpg = len(cpg_sites)
+#                 num_c = len(any_c_sites)
+#                 print(f"Found {num_gpc} GpC sites at {gpc_sites} \nand {num_cpg} CpG sites at {cpg_sites} for {sample} - {ref}")
+#                 # Use var_names for x-axis tick labels
+#                 gpc_labels = subset.var_names[gpc_sites].astype(int)
+#                 cpg_labels = subset.var_names[cpg_sites].astype(int)
+#                 any_c_labels = subset.var_names[any_c_sites].astype(int)
+#                 stacked_hmm_feature, stacked_gpc, stacked_cpg, stacked_any_c = [], [], [], []
+#                 row_labels, bin_labels = [], []
+#                 bin_boundaries = []
+#                 total_reads = subset.shape[0]
+#                 percentages = {}
+#                 last_idx = 0
+#                 for bin_label, bin_filter in bins_temp.items():
+#                     subset_bin = subset[bin_filter].copy()
+#                     num_reads = subset_bin.shape[0]
+#                     print(f"analyzing {num_reads} reads for {bin_label} bin in {sample} - {ref}")
+#                     percent_reads = (num_reads / total_reads) * 100 if total_reads > 0 else 0
+#                     percentages[bin_label] = percent_reads
+#                     if num_reads > 0 and num_cpg > 0 and num_gpc > 0:
+#                         # Determine sorting order
+#                         if sort_by.startswith("obs:"):
+#                             colname = sort_by.split("obs:")[1]
+#                             order = np.argsort(subset_bin.obs[colname].values)
+#                         elif sort_by == "gpc":
+#                             linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "cpg":
+#                             linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "gpc_cpg":
+#                             linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "none":
+#                             order = np.arange(num_reads)
+#                         elif sort_by == "any_c":
+#                             linkage = sch.linkage(subset_bin.layers[layer_any_c], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         else:
+#                             raise ValueError(f"Unsupported sort_by option: {sort_by}")
+#                         stacked_hmm_feature.append(subset_bin[order].layers[hmm_feature_layer])
+#                         stacked_gpc.append(subset_bin[order][:, gpc_sites].layers[layer_gpc])
+#                         stacked_cpg.append(subset_bin[order][:, cpg_sites].layers[layer_cpg])
+#                         stacked_any_c.append(subset_bin[order][:, any_c_sites].layers[layer_any_c])
+#                         row_labels.extend([bin_label] * num_reads)
+#                         bin_labels.append(f"{bin_label}: {num_reads} reads ({percent_reads:.1f}%)")
+#                         last_idx += num_reads
+#                         bin_boundaries.append(last_idx)
+#                 if stacked_hmm_feature:
+#                     hmm_matrix = np.vstack(stacked_hmm_feature)
+#                     gpc_matrix = np.vstack(stacked_gpc)
+#                     cpg_matrix = np.vstack(stacked_cpg)
+#                     any_c_matrix = np.vstack(stacked_any_c)
+#                     if hmm_matrix.size > 0:
+#                         def normalized_mean(matrix):
+#                             mean = np.nanmean(matrix, axis=0)
+#                             normalized = (mean - mean.min()) / (mean.max() - mean.min() + 1e-9)
+#                             return normalized
+#                         def methylation_fraction(matrix):
+#                             methylated = (matrix == 1).sum(axis=0)
+#                             valid = (matrix != 0).sum(axis=0)
+#                             return np.divide(methylated, valid, out=np.zeros_like(methylated, dtype=float), where=valid != 0)
+#                         if normalize_hmm:
+#                             mean_hmm = normalized_mean(hmm_matrix)
+#                         else:
+#                             mean_hmm = np.nanmean(hmm_matrix, axis=0)
+#                         mean_gpc = methylation_fraction(gpc_matrix)
+#                         mean_cpg = methylation_fraction(cpg_matrix)
+#                         mean_any_c = methylation_fraction(any_c_matrix)
+#                         fig = plt.figure(figsize=(18, 12))
+#                         gs = gridspec.GridSpec(2, 4, height_ratios=[1, 6], hspace=0.01)
+#                         fig.suptitle(f"{sample} - {ref} - {total_reads} reads", fontsize=14, y=0.95)
+#                         axes_heat = [fig.add_subplot(gs[1, i]) for i in range(4)]
+#                         axes_bar = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(4)]
+#                         clean_barplot(axes_bar[0], mean_hmm, f"{hmm_feature_layer} HMM Features")
+#                         clean_barplot(axes_bar[1], mean_gpc, f"GpC Accessibility Signal")
+#                         clean_barplot(axes_bar[2], mean_cpg, f"CpG Accessibility Signal")
+#                         clean_barplot(axes_bar[3], mean_any_c, f"Any C Accessibility Signal")
+#                         hmm_labels = subset.var_names.astype(int)
+#                         hmm_label_spacing = 150
+#                         sns.heatmap(hmm_matrix, cmap=cmap_hmm, ax=axes_heat[0], xticklabels=hmm_labels[::hmm_label_spacing], yticklabels=False, cbar=False)
+#                         axes_heat[0].set_xticks(range(0, len(hmm_labels), hmm_label_spacing))
+#                         axes_heat[0].set_xticklabels(hmm_labels[::hmm_label_spacing], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[0].axhline(y=boundary, color="black", linewidth=2)
+#                         sns.heatmap(gpc_matrix, cmap=cmap_gpc, ax=axes_heat[1], xticklabels=gpc_labels[::5], yticklabels=False, cbar=False)
+#                         axes_heat[1].set_xticks(range(0, len(gpc_labels), 5))
+#                         axes_heat[1].set_xticklabels(gpc_labels[::5], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[1].axhline(y=boundary, color="black", linewidth=2)
+#                         sns.heatmap(cpg_matrix, cmap=cmap_cpg, ax=axes_heat[2], xticklabels=cpg_labels, yticklabels=False, cbar=False)
+#                         axes_heat[2].set_xticklabels(cpg_labels, rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[2].axhline(y=boundary, color="black", linewidth=2)
+#                         sns.heatmap(any_c_matrix, cmap=cmap_any_c, ax=axes_heat[3], xticklabels=any_c_labels[::20], yticklabels=False, cbar=False)
+#                         axes_heat[3].set_xticks(range(0, len(any_c_labels), 20))
+#                         axes_heat[3].set_xticklabels(any_c_labels[::20], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[3].axhline(y=boundary, color="black", linewidth=2)
+#                         plt.tight_layout()
+#                         if save_path:
+#                             save_name = f"{ref} — {sample}"
+#                             os.makedirs(save_path, exist_ok=True)
+#                             safe_name = save_name.replace("=", "").replace("__", "_").replace(",", "_")
+#                             out_file = os.path.join(save_path, f"{safe_name}.png")
+#                             plt.savefig(out_file, dpi=300)
+#                             print(f"Saved: {out_file}")
+#                             plt.close()
+#                         else:
+#                             plt.show()
+#                         print(f"Summary for {sample} - {ref}:")
+#                         for bin_label, percent in percentages.items():
+#                             print(f"  - {bin_label}: {percent:.1f}%")
+#                         results.append({
+#                             "sample": sample,
+#                             "ref": ref,
+#                             "hmm_matrix": hmm_matrix,
+#                             "gpc_matrix": gpc_matrix,
+#                             "cpg_matrix": cpg_matrix,
+#                             "row_labels": row_labels,
+#                             "bin_labels": bin_labels,
+#                             "bin_boundaries": bin_boundaries,
+#                             "percentages": percentages
+#                         })
+#                         #adata.uns['clustermap_results'] = results
+#             except Exception as e:
+#                 import traceback
+#                 traceback.print_exc()
+#                 continue
 def combined_hmm_raw_clustermap(
     adata,
-    sample_col='Sample_Names',
-    reference_col='Reference_strand',
-    hmm_feature_layer="hmm_combined",
-    layer_gpc="nan0_0minus1",
-    layer_cpg="nan0_0minus1",
-    layer_any_c="nan0_0minus1",
-    cmap_hmm="tab10",
-    cmap_gpc="coolwarm",
-    cmap_cpg="viridis",
-    cmap_any_c='coolwarm',
-    min_quality=20,
-    min_length=200,
-    min_mapped_length_to_reference_length_ratio=0.8,
-    min_position_valid_fraction=0.5,
-    sample_mapping=None,
-    save_path=None,
-    normalize_hmm=False,
-    sort_by="gpc",  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
-    bins=None,
-    deaminase=False,
-    min_signal=0
-    ):
-    import scipy.cluster.hierarchy as sch
-    import pandas as pd
-    import numpy as np
-    import seaborn as sns
-    import matplotlib.pyplot as plt
-    import matplotlib.gridspec as gridspec
-    import os
+    sample_col: str = "Sample_Names",
+    reference_col: str = "Reference_strand",
+    hmm_feature_layer: str = "hmm_combined",
+    layer_gpc: str = "nan0_0minus1",
+    layer_cpg: str = "nan0_0minus1",
+    layer_c: str = "nan0_0minus1",
+    layer_a: str = "nan0_0minus1",
+    cmap_hmm: str = "tab10",
+    cmap_gpc: str = "coolwarm",
+    cmap_cpg: str = "viridis",
+    cmap_c: str = "coolwarm",
+    cmap_a: str = "coolwarm",
+    min_quality: int = 20,
+    min_length: int = 200,
+    min_mapped_length_to_reference_length_ratio: float = 0.8,
+    min_position_valid_fraction: float = 0.5,
+    save_path: str | Path | None = None,
+    normalize_hmm: bool = False,
+    sort_by: str = "gpc",
+    bins: Optional[Dict[str, Any]] = None,
+    deaminase: bool = False,
+    min_signal: float = 0.0,
+    # ---- fixed tick label controls (counts, not spacing)
+    n_xticks_hmm: int = 10,
+    n_xticks_any_c: int = 8,
+    n_xticks_gpc: int = 8,
+    n_xticks_cpg: int = 8,
+    n_xticks_a: int = 8,
+    index_col_suffix: str | None = None,
+):
+    """
+    Makes a multi-panel clustermap per (sample, reference):
+      HMM panel (always) + optional raw panels for C, GpC, CpG, and A sites.
+    Panels are added only if the corresponding site mask exists AND has >0 sites.
+    sort_by options:
+      'gpc', 'cpg', 'c', 'a', 'gpc_cpg', 'none', 'hmm', or 'obs:<col>'
+    """
+    def pick_xticks(labels: np.ndarray, n_ticks: int):
+        if labels.size == 0:
+            return [], []
+        idx = np.linspace(0, len(labels) - 1, n_ticks).round().astype(int)
+        idx = np.unique(idx)
+        return idx.tolist(), labels[idx].tolist()
     results = []
-    if deaminase:
-        signal_type = 'deamination'
-    else:
-        signal_type = 'methylation'
+    signal_type = "deamination" if deaminase else "methylation"
     for ref in adata.obs[reference_col].cat.categories:
         for sample in adata.obs[sample_col].cat.categories:
             try:
+                # ---- subset reads ----
                 subset = adata[
                     (adata.obs[reference_col] == ref) &
                     (adata.obs[sample_col] == sample) &
-                    (adata.obs['read_quality'] >= min_quality) &
-                    (adata.obs['read_length'] >= min_length) &
-                    (adata.obs['mapped_length_to_reference_length_ratio'] > min_mapped_length_to_reference_length_ratio)
+                    (adata.obs["read_quality"] >= min_quality) &
+                    (adata.obs["read_length"] >= min_length) &
+                    (
+                        adata.obs["mapped_length_to_reference_length_ratio"]
+                        > min_mapped_length_to_reference_length_ratio
+                    )
                 ]
-                mask = subset.var[f"{ref}_valid_fraction"].astype(float) > float(min_position_valid_fraction)
-                subset = subset[:, mask]
+                # ---- valid fraction filter ----
+                vf_key = f"{ref}_valid_fraction"
+                if vf_key in subset.var:
+                    mask = subset.var[vf_key].astype(float) > float(min_position_valid_fraction)
+                    subset = subset[:, mask]
                 if subset.shape[0] == 0:
-                    print(f"  No reads left after filtering for {sample} - {ref}")
                     continue
-                if bins:
-                    print(f"Using defined bins to subset clustermap for {sample} - {ref}")
-                    bins_temp = bins
+                # ---- bins ----
+                if bins is None:
+                    bins_temp = {"All": np.ones(subset.n_obs, dtype=bool)}
                 else:
-                    print(f"Using all reads for clustermap for {sample} - {ref}")
-                    bins_temp = {"All": (subset.obs['Reference_strand'] == ref)}
-                # Get column positions (not var_names!) of site masks
-                gpc_sites = np.where(subset.var[f"{ref}_GpC_site"].values)[0]
-                cpg_sites = np.where(subset.var[f"{ref}_CpG_site"].values)[0]
-                any_c_sites = np.where(subset.var[f"{ref}_any_C_site"].values)[0]
-                num_gpc = len(gpc_sites)
-                num_cpg = len(cpg_sites)
-                num_c = len(any_c_sites)
-                print(f"Found {num_gpc} GpC sites at {gpc_sites} \nand {num_cpg} CpG sites at {cpg_sites} for {sample} - {ref}")
-                # Use var_names for x-axis tick labels
-                gpc_labels = subset.var_names[gpc_sites].astype(int)
-                cpg_labels = subset.var_names[cpg_sites].astype(int)
-                any_c_labels = subset.var_names[any_c_sites].astype(int)
-                stacked_hmm_feature, stacked_gpc, stacked_cpg, stacked_any_c = [], [], [], []
-                row_labels, bin_labels = [], []
-                bin_boundaries = []
+                    bins_temp = bins
-                total_reads = subset.shape[0]
+                # ---- site masks (robust) ----
+                def _sites(*keys):
+                    for k in keys:
+                        if k in subset.var:
+                            return np.where(subset.var[k].values)[0]
+                    return np.array([], dtype=int)
+                gpc_sites   = _sites(f"{ref}_GpC_site")
+                cpg_sites   = _sites(f"{ref}_CpG_site")
+                any_c_sites = _sites(f"{ref}_any_C_site", f"{ref}_C_site")
+                any_a_sites = _sites(f"{ref}_A_site", f"{ref}_any_A_site")
+                # ---- labels via _select_labels ----
+                # HMM uses *all* columns
+                hmm_sites   = np.arange(subset.n_vars, dtype=int)
+                hmm_labels  = _select_labels(subset, hmm_sites,   ref, index_col_suffix)
+                gpc_labels  = _select_labels(subset, gpc_sites,   ref, index_col_suffix)
+                cpg_labels  = _select_labels(subset, cpg_sites,   ref, index_col_suffix)
+                any_c_labels = _select_labels(subset, any_c_sites, ref, index_col_suffix)
+                any_a_labels = _select_labels(subset, any_a_sites, ref, index_col_suffix)
+                # storage
+                stacked_hmm = []
+                stacked_any_c = []
+                stacked_gpc = []
+                stacked_cpg = []
+                stacked_any_a = []
+                row_labels, bin_labels, bin_boundaries = [], [], []
+                total_reads = subset.n_obs
                 percentages = {}
                 last_idx = 0
+                # ---------------- process bins ----------------
                 for bin_label, bin_filter in bins_temp.items():
-                    subset_bin = subset[bin_filter].copy()
-                    num_reads = subset_bin.shape[0]
-                    print(f"analyzing {num_reads} reads for {bin_label} bin in {sample} - {ref}")
-                    percent_reads = (num_reads / total_reads) * 100 if total_reads > 0 else 0
-                    percentages[bin_label] = percent_reads
+                    sb = subset[bin_filter].copy()
+                    n = sb.n_obs
+                    if n == 0:
+                        continue
-                    if num_reads > 0 and num_cpg > 0 and num_gpc > 0:
-                        # Determine sorting order
-                        if sort_by.startswith("obs:"):
-                            colname = sort_by.split("obs:")[1]
-                            order = np.argsort(subset_bin.obs[colname].values)
-                        elif sort_by == "gpc":
-                            linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "cpg":
-                            linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "gpc_cpg":
-                            linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "none":
-                            order = np.arange(num_reads)
-                        elif sort_by == "any_c":
-                            linkage = sch.linkage(subset_bin.layers[layer_any_c], method="ward")
-                            order = sch.leaves_list(linkage)
-                        else:
-                            raise ValueError(f"Unsupported sort_by option: {sort_by}")
-                        stacked_hmm_feature.append(subset_bin[order].layers[hmm_feature_layer])
-                        stacked_gpc.append(subset_bin[order][:, gpc_sites].layers[layer_gpc])
-                        stacked_cpg.append(subset_bin[order][:, cpg_sites].layers[layer_cpg])
-                        stacked_any_c.append(subset_bin[order][:, any_c_sites].layers[layer_any_c])
-                        row_labels.extend([bin_label] * num_reads)
-                        bin_labels.append(f"{bin_label}: {num_reads} reads ({percent_reads:.1f}%)")
-                        last_idx += num_reads
-                        bin_boundaries.append(last_idx)
-                if stacked_hmm_feature:
-                    hmm_matrix = np.vstack(stacked_hmm_feature)
-                    gpc_matrix = np.vstack(stacked_gpc)
-                    cpg_matrix = np.vstack(stacked_cpg)
-                    any_c_matrix = np.vstack(stacked_any_c)
+                    pct = (n / total_reads) * 100 if total_reads else 0
+                    percentages[bin_label] = pct
-                    if hmm_matrix.size > 0:
-                        def normalized_mean(matrix):
-                            mean = np.nanmean(matrix, axis=0)
-                            normalized = (mean - mean.min()) / (mean.max() - mean.min() + 1e-9)
-                            return normalized
+                    # ---- sorting ----
+                    if sort_by.startswith("obs:"):
+                        colname = sort_by.split("obs:")[1]
+                        order = np.argsort(sb.obs[colname].values)
-                        def methylation_fraction(matrix):
-                            methylated = (matrix == 1).sum(axis=0)
-                            valid = (matrix != 0).sum(axis=0)
-                            return np.divide(methylated, valid, out=np.zeros_like(methylated, dtype=float), where=valid != 0)
+                    elif sort_by == "gpc" and gpc_sites.size:
+                        linkage = sch.linkage(sb[:, gpc_sites].layers[layer_gpc], method="ward")
+                        order = sch.leaves_list(linkage)
-                        if normalize_hmm:
-                            mean_hmm = normalized_mean(hmm_matrix)
-                        else:
-                            mean_hmm = np.nanmean(hmm_matrix, axis=0)
-                        mean_gpc = methylation_fraction(gpc_matrix)
-                        mean_cpg = methylation_fraction(cpg_matrix)
-                        mean_any_c = methylation_fraction(any_c_matrix)
-                        fig = plt.figure(figsize=(18, 12))
-                        gs = gridspec.GridSpec(2, 4, height_ratios=[1, 6], hspace=0.01)
-                        fig.suptitle(f"{sample} - {ref}", fontsize=14, y=0.95)
-                        axes_heat = [fig.add_subplot(gs[1, i]) for i in range(4)]
-                        axes_bar = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(4)]
-                        clean_barplot(axes_bar[0], mean_hmm, f"{hmm_feature_layer} HMM Features")
-                        clean_barplot(axes_bar[1], mean_gpc, f"GpC Accessibility Signal")
-                        clean_barplot(axes_bar[2], mean_cpg, f"CpG Accessibility Signal")
-                        clean_barplot(axes_bar[3], mean_any_c, f"Any C Accessibility Signal")
-                        hmm_labels = subset.var_names.astype(int)
-                        hmm_label_spacing = 150
-                        sns.heatmap(hmm_matrix, cmap=cmap_hmm, ax=axes_heat[0], xticklabels=hmm_labels[::hmm_label_spacing], yticklabels=False, cbar=False)
-                        axes_heat[0].set_xticks(range(0, len(hmm_labels), hmm_label_spacing))
-                        axes_heat[0].set_xticklabels(hmm_labels[::hmm_label_spacing], rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[0].axhline(y=boundary, color="black", linewidth=2)
-                        sns.heatmap(gpc_matrix, cmap=cmap_gpc, ax=axes_heat[1], xticklabels=gpc_labels[::5], yticklabels=False, cbar=False)
-                        axes_heat[1].set_xticks(range(0, len(gpc_labels), 5))
-                        axes_heat[1].set_xticklabels(gpc_labels[::5], rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[1].axhline(y=boundary, color="black", linewidth=2)
-                        sns.heatmap(cpg_matrix, cmap=cmap_cpg, ax=axes_heat[2], xticklabels=cpg_labels, yticklabels=False, cbar=False)
-                        axes_heat[2].set_xticklabels(cpg_labels, rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[2].axhline(y=boundary, color="black", linewidth=2)
-                        sns.heatmap(any_c_matrix, cmap=cmap_any_c, ax=axes_heat[3], xticklabels=any_c_labels[::20], yticklabels=False, cbar=False)
-                        axes_heat[3].set_xticks(range(0, len(any_c_labels), 20))
-                        axes_heat[3].set_xticklabels(any_c_labels[::20], rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[3].axhline(y=boundary, color="black", linewidth=2)
-                        plt.tight_layout()
-                        if save_path:
-                            save_name = f"{ref} — {sample}"
-                            os.makedirs(save_path, exist_ok=True)
-                            safe_name = save_name.replace("=", "").replace("__", "_").replace(",", "_")
-                            out_file = os.path.join(save_path, f"{safe_name}.png")
-                            plt.savefig(out_file, dpi=300)
-                            print(f"Saved: {out_file}")
-                            plt.close()
-                        else:
-                            plt.show()
-                        print(f"Summary for {sample} - {ref}:")
-                        for bin_label, percent in percentages.items():
-                            print(f"  - {bin_label}: {percent:.1f}%")
-                        results.append({
-                            "sample": sample,
-                            "ref": ref,
-                            "hmm_matrix": hmm_matrix,
-                            "gpc_matrix": gpc_matrix,
-                            "cpg_matrix": cpg_matrix,
-                            "row_labels": row_labels,
-                            "bin_labels": bin_labels,
-                            "bin_boundaries": bin_boundaries,
-                            "percentages": percentages
-                        })
-                        adata.uns['clustermap_results'] = results
+                    elif sort_by == "cpg" and cpg_sites.size:
+                        linkage = sch.linkage(sb[:, cpg_sites].layers[layer_cpg], method="ward")
+                        order = sch.leaves_list(linkage)
-            except Exception as e:
+                    elif sort_by == "c" and any_c_sites.size:
+                        linkage = sch.linkage(sb[:, any_c_sites].layers[layer_c], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "a" and any_a_sites.size:
+                        linkage = sch.linkage(sb[:, any_a_sites].layers[layer_a], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "gpc_cpg" and gpc_sites.size and cpg_sites.size:
+                        linkage = sch.linkage(sb.layers[layer_gpc], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "hmm" and hmm_sites.size:
+                        linkage = sch.linkage(sb[:, hmm_sites].layers[hmm_feature_layer], method="ward")
+                        order = sch.leaves_list(linkage)
+                    else:
+                        order = np.arange(n)
+                    sb = sb[order]
+                    # ---- collect matrices ----
+                    stacked_hmm.append(sb.layers[hmm_feature_layer])
+                    if any_c_sites.size:
+                        stacked_any_c.append(sb[:, any_c_sites].layers[layer_c])
+                    if gpc_sites.size:
+                        stacked_gpc.append(sb[:, gpc_sites].layers[layer_gpc])
+                    if cpg_sites.size:
+                        stacked_cpg.append(sb[:, cpg_sites].layers[layer_cpg])
+                    if any_a_sites.size:
+                        stacked_any_a.append(sb[:, any_a_sites].layers[layer_a])
+                    row_labels.extend([bin_label] * n)
+                    bin_labels.append(f"{bin_label}: {n} reads ({pct:.1f}%)")
+                    last_idx += n
+                    bin_boundaries.append(last_idx)
+                # ---------------- stack ----------------
+                hmm_matrix = np.vstack(stacked_hmm)
+                mean_hmm = normalized_mean(hmm_matrix) if normalize_hmm else np.nanmean(hmm_matrix, axis=0)
+                panels = [
+                    (f"HMM - {hmm_feature_layer}", hmm_matrix, hmm_labels, cmap_hmm, mean_hmm, n_xticks_hmm),
+                ]
+                if stacked_any_c:
+                    m = np.vstack(stacked_any_c)
+                    panels.append(("C", m, any_c_labels, cmap_c, methylation_fraction(m), n_xticks_any_c))
+                if stacked_gpc:
+                    m = np.vstack(stacked_gpc)
+                    panels.append(("GpC", m, gpc_labels, cmap_gpc, methylation_fraction(m), n_xticks_gpc))
+                if stacked_cpg:
+                    m = np.vstack(stacked_cpg)
+                    panels.append(("CpG", m, cpg_labels, cmap_cpg, methylation_fraction(m), n_xticks_cpg))
+                if stacked_any_a:
+                    m = np.vstack(stacked_any_a)
+                    panels.append(("A", m, any_a_labels, cmap_a, methylation_fraction(m), n_xticks_a))
+                # ---------------- plotting ----------------
+                n_panels = len(panels)
+                fig = plt.figure(figsize=(4.5 * n_panels, 10))
+                gs = gridspec.GridSpec(2, n_panels, height_ratios=[1, 6], hspace=0.01)
+                fig.suptitle(f"{sample} — {ref} — {total_reads} reads ({signal_type})",
+                             fontsize=14, y=0.98)
+                axes_heat = [fig.add_subplot(gs[1, i]) for i in range(n_panels)]
+                axes_bar = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(n_panels)]
+                for i, (name, matrix, labels, cmap, mean_vec, n_ticks) in enumerate(panels):
+                    # ---- per-position mean barplot ----
+                    clean_barplot(axes_bar[i], mean_vec, name)
+                    # ---- heatmap ----
+                    sns.heatmap(matrix, cmap=cmap, ax=axes_heat[i],
+                                yticklabels=False, cbar=False)
+                    # ---- xticks ----
+                    xtick_pos, xtick_labels = pick_xticks(np.asarray(labels), n_ticks)
+                    axes_heat[i].set_xticks(xtick_pos)
+                    axes_heat[i].set_xticklabels(xtick_labels, rotation=90, fontsize=8)
+                    for boundary in bin_boundaries[:-1]:
+                        axes_heat[i].axhline(y=boundary, color="black", linewidth=1.2)
+                plt.tight_layout()
+                if save_path:
+                    save_path = Path(save_path)
+                    save_path.mkdir(parents=True, exist_ok=True)
+                    safe_name = f"{ref}__{sample}".replace("/", "_")
+                    out_file = save_path / f"{safe_name}.png"
+                    plt.savefig(out_file, dpi=300)
+                    plt.close(fig)
+                else:
+                    plt.show()
+            except Exception:
                 import traceback
                 traceback.print_exc()
                 continue
+# def combined_raw_clustermap(
+#     adata,
+#     sample_col='Sample_Names',
+#     reference_col='Reference_strand',
+#     mod_target_bases=['GpC', 'CpG'],
+#     layer_any_c="nan0_0minus1",
+#     layer_gpc="nan0_0minus1",
+#     layer_cpg="nan0_0minus1",
+#     layer_a="nan0_0minus1",
+#     cmap_any_c="coolwarm",
+#     cmap_gpc="coolwarm",
+#     cmap_cpg="viridis",
+#     cmap_a="coolwarm",
+#     min_quality=20,
+#     min_length=200,
+#     min_mapped_length_to_reference_length_ratio=0.8,
+#     min_position_valid_fraction=0.5,
+#     sample_mapping=None,
+#     save_path=None,
+#     sort_by="gpc",  # options: 'gpc', 'cpg', 'any_c', 'gpc_cpg', 'none', 'any_a', or 'obs:<column>'
+#     bins=None,
+#     deaminase=False,
+#     min_signal=0
+#     ):
+#     results = []
+#     for ref in adata.obs[reference_col].cat.categories:
+#         for sample in adata.obs[sample_col].cat.categories:
+#             try:
+#                 subset = adata[
+#                     (adata.obs[reference_col] == ref) &
+#                     (adata.obs[sample_col] == sample) &
+#                     (adata.obs['read_quality'] >= min_quality) &
+#                     (adata.obs['mapped_length'] >= min_length) &
+#                     (adata.obs['mapped_length_to_reference_length_ratio'] >= min_mapped_length_to_reference_length_ratio)
+#                 ]
+#                 mask = subset.var[f"{ref}_valid_fraction"].astype(float) > float(min_position_valid_fraction)
+#                 subset = subset[:, mask]
+#                 if subset.shape[0] == 0:
+#                     print(f"  No reads left after filtering for {sample} - {ref}")
+#                     continue
+#                 if bins:
+#                     print(f"Using defined bins to subset clustermap for {sample} - {ref}")
+#                     bins_temp = bins
+#                 else:
+#                     print(f"Using all reads for clustermap for {sample} - {ref}")
+#                     bins_temp = {"All": (subset.obs['Reference_strand'] == ref)}
+#                 num_any_c = 0
+#                 num_gpc = 0
+#                 num_cpg = 0
+#                 num_any_a = 0
+#                 # Get column positions (not var_names!) of site masks
+#                 if any(base in ["C", "CpG", "GpC"] for base in mod_target_bases):
+#                     any_c_sites = np.where(subset.var[f"{ref}_C_site"].values)[0]
+#                     gpc_sites = np.where(subset.var[f"{ref}_GpC_site"].values)[0]
+#                     cpg_sites = np.where(subset.var[f"{ref}_CpG_site"].values)[0]
+#                     num_any_c = len(any_c_sites)
+#                     num_gpc = len(gpc_sites)
+#                     num_cpg = len(cpg_sites)
+#                     print(f"Found {num_gpc} GpC sites at {gpc_sites} \nand {num_cpg} CpG sites at {cpg_sites}\n and {num_any_c} any_C sites at {any_c_sites} for {sample} - {ref}")
+#                     # Use var_names for x-axis tick labels
+#                     gpc_labels = subset.var_names[gpc_sites].astype(int)
+#                     cpg_labels = subset.var_names[cpg_sites].astype(int)
+#                     any_c_labels = subset.var_names[any_c_sites].astype(int)
+#                 stacked_any_c, stacked_gpc, stacked_cpg = [], [], []
+#                 if "A" in mod_target_bases:
+#                     any_a_sites = np.where(subset.var[f"{ref}_A_site"].values)[0]
+#                     num_any_a = len(any_a_sites)
+#                     print(f"Found {num_any_a} any_A sites at {any_a_sites} for {sample} - {ref}")
+#                     any_a_labels = subset.var_names[any_a_sites].astype(int)
+#                 stacked_any_a = []
+#                 row_labels, bin_labels = [], []
+#                 bin_boundaries = []
+#                 total_reads = subset.shape[0]
+#                 percentages = {}
+#                 last_idx = 0
+#                 for bin_label, bin_filter in bins_temp.items():
+#                     subset_bin = subset[bin_filter].copy()
+#                     num_reads = subset_bin.shape[0]
+#                     print(f"analyzing {num_reads} reads for {bin_label} bin in {sample} - {ref}")
+#                     percent_reads = (num_reads / total_reads) * 100 if total_reads > 0 else 0
+#                     percentages[bin_label] = percent_reads
+#                     if num_reads > 0 and num_cpg > 0 and num_gpc > 0:
+#                         # Determine sorting order
+#                         if sort_by.startswith("obs:"):
+#                             colname = sort_by.split("obs:")[1]
+#                             order = np.argsort(subset_bin.obs[colname].values)
+#                         elif sort_by == "gpc":
+#                             linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "cpg":
+#                             linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "any_c":
+#                             linkage = sch.linkage(subset_bin[:, any_c_sites].layers[layer_any_c], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "gpc_cpg":
+#                             linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "none":
+#                             order = np.arange(num_reads)
+#                         elif sort_by == "any_a":
+#                             linkage = sch.linkage(subset_bin.layers[layer_a], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         else:
+#                             raise ValueError(f"Unsupported sort_by option: {sort_by}")
+#                         stacked_any_c.append(subset_bin[order][:, any_c_sites].layers[layer_any_c])
+#                         stacked_gpc.append(subset_bin[order][:, gpc_sites].layers[layer_gpc])
+#                         stacked_cpg.append(subset_bin[order][:, cpg_sites].layers[layer_cpg])
+#                     if num_reads > 0 and num_any_a > 0:
+#                         # Determine sorting order
+#                         if sort_by.startswith("obs:"):
+#                             colname = sort_by.split("obs:")[1]
+#                             order = np.argsort(subset_bin.obs[colname].values)
+#                         elif sort_by == "gpc":
+#                             linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "cpg":
+#                             linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "any_c":
+#                             linkage = sch.linkage(subset_bin[:, any_c_sites].layers[layer_any_c], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "gpc_cpg":
+#                             linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         elif sort_by == "none":
+#                             order = np.arange(num_reads)
+#                         elif sort_by == "any_a":
+#                             linkage = sch.linkage(subset_bin.layers[layer_a], method="ward")
+#                             order = sch.leaves_list(linkage)
+#                         else:
+#                             raise ValueError(f"Unsupported sort_by option: {sort_by}")
+#                         stacked_any_a.append(subset_bin[order][:, any_a_sites].layers[layer_a])
+#                     row_labels.extend([bin_label] * num_reads)
+#                     bin_labels.append(f"{bin_label}: {num_reads} reads ({percent_reads:.1f}%)")
+#                     last_idx += num_reads
+#                     bin_boundaries.append(last_idx)
+#                 gs_dim = 0
+#                 if stacked_any_c:
+#                     any_c_matrix = np.vstack(stacked_any_c)
+#                     gpc_matrix = np.vstack(stacked_gpc)
+#                     cpg_matrix = np.vstack(stacked_cpg)
+#                     if any_c_matrix.size > 0:
+#                         mean_gpc = methylation_fraction(gpc_matrix)
+#                         mean_cpg = methylation_fraction(cpg_matrix)
+#                         mean_any_c = methylation_fraction(any_c_matrix)
+#                     gs_dim += 3
+#                 if stacked_any_a:
+#                     any_a_matrix = np.vstack(stacked_any_a)
+#                     if any_a_matrix.size > 0:
+#                         mean_any_a = methylation_fraction(any_a_matrix)
+#                         gs_dim += 1
+#                 fig = plt.figure(figsize=(18, 12))
+#                 gs = gridspec.GridSpec(2, gs_dim, height_ratios=[1, 6], hspace=0.01)
+#                 fig.suptitle(f"{sample} - {ref} - {total_reads} reads", fontsize=14, y=0.95)
+#                 axes_heat = [fig.add_subplot(gs[1, i]) for i in range(gs_dim)]
+#                 axes_bar = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(gs_dim)]
+#                 current_ax = 0
+#                 if stacked_any_c:
+#                     if any_c_matrix.size > 0:
+#                         clean_barplot(axes_bar[current_ax], mean_any_c, f"any C site Modification Signal")
+#                         sns.heatmap(any_c_matrix, cmap=cmap_any_c, ax=axes_heat[current_ax], xticklabels=any_c_labels[::20], yticklabels=False, cbar=False)
+#                         axes_heat[current_ax].set_xticks(range(0, len(any_c_labels), 20))
+#                         axes_heat[current_ax].set_xticklabels(any_c_labels[::20], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[current_ax].axhline(y=boundary, color="black", linewidth=2)
+#                         current_ax +=1
+#                         clean_barplot(axes_bar[current_ax], mean_gpc, f"GpC Modification Signal")
+#                         sns.heatmap(gpc_matrix, cmap=cmap_gpc, ax=axes_heat[current_ax], xticklabels=gpc_labels[::5], yticklabels=False, cbar=False)
+#                         axes_heat[current_ax].set_xticks(range(0, len(gpc_labels), 5))
+#                         axes_heat[current_ax].set_xticklabels(gpc_labels[::5], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[current_ax].axhline(y=boundary, color="black", linewidth=2)
+#                         current_ax +=1
+#                         clean_barplot(axes_bar[current_ax], mean_cpg, f"CpG Modification Signal")
+#                         sns.heatmap(cpg_matrix, cmap=cmap_cpg, ax=axes_heat[2], xticklabels=cpg_labels, yticklabels=False, cbar=False)
+#                         axes_heat[current_ax].set_xticklabels(cpg_labels, rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[current_ax].axhline(y=boundary, color="black", linewidth=2)
+#                         current_ax +=1
+#                         results.append({
+#                             "sample": sample,
+#                             "ref": ref,
+#                             "any_c_matrix": any_c_matrix,
+#                             "gpc_matrix": gpc_matrix,
+#                             "cpg_matrix": cpg_matrix,
+#                             "row_labels": row_labels,
+#                             "bin_labels": bin_labels,
+#                             "bin_boundaries": bin_boundaries,
+#                             "percentages": percentages
+#                         })
+#                 if stacked_any_a:
+#                     if any_a_matrix.size > 0:
+#                         clean_barplot(axes_bar[current_ax], mean_any_a, f"any A site Modification Signal")
+#                         sns.heatmap(any_a_matrix, cmap=cmap_a, ax=axes_heat[current_ax], xticklabels=any_a_labels[::20], yticklabels=False, cbar=False)
+#                         axes_heat[current_ax].set_xticks(range(0, len(any_a_labels), 20))
+#                         axes_heat[current_ax].set_xticklabels(any_a_labels[::20], rotation=90, fontsize=10)
+#                         for boundary in bin_boundaries[:-1]:
+#                             axes_heat[current_ax].axhline(y=boundary, color="black", linewidth=2)
+#                         current_ax +=1
+#                         results.append({
+#                             "sample": sample,
+#                             "ref": ref,
+#                             "any_a_matrix": any_a_matrix,
+#                             "row_labels": row_labels,
+#                             "bin_labels": bin_labels,
+#                             "bin_boundaries": bin_boundaries,
+#                             "percentages": percentages
+#                         })
+#                 plt.tight_layout()
+#                 if save_path:
+#                     save_name = f"{ref} — {sample}"
+#                     os.makedirs(save_path, exist_ok=True)
+#                     safe_name = save_name.replace("=", "").replace("__", "_").replace(",", "_")
+#                     out_file = os.path.join(save_path, f"{safe_name}.png")
+#                     plt.savefig(out_file, dpi=300)
+#                     print(f"Saved: {out_file}")
+#                     plt.close()
+#                 else:
+#                     plt.show()
+#                 print(f"Summary for {sample} - {ref}:")
+#                 for bin_label, percent in percentages.items():
+#                     print(f"  - {bin_label}: {percent:.1f}%")
+#                 adata.uns['clustermap_results'] = results
+#             except Exception as e:
+#                 import traceback
+#                 traceback.print_exc()
+#                 continue
 def combined_raw_clustermap(
     adata,
-    sample_col='Sample_Names',
-    reference_col='Reference_strand',
-    layer_any_c="nan0_0minus1",
-    layer_gpc="nan0_0minus1",
-    layer_cpg="nan0_0minus1",
-    cmap_any_c="coolwarm",
-    cmap_gpc="coolwarm",
-    cmap_cpg="viridis",
-    min_quality=20,
-    min_length=200,
-    min_mapped_length_to_reference_length_ratio=0.8,
-    min_position_valid_fraction=0.5,
-    sample_mapping=None,
-    save_path=None,
-    sort_by="gpc",  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
-    bins=None,
-    deaminase=False,
-    min_signal=0
-    ):
-    import scipy.cluster.hierarchy as sch
-    import pandas as pd
-    import numpy as np
-    import seaborn as sns
-    import matplotlib.pyplot as plt
-    import matplotlib.gridspec as gridspec
-    import os
+    sample_col: str = "Sample_Names",
+    reference_col: str = "Reference_strand",
+    mod_target_bases: Sequence[str] = ("GpC", "CpG"),
+    layer_c: str = "nan0_0minus1",
+    layer_gpc: str = "nan0_0minus1",
+    layer_cpg: str = "nan0_0minus1",
+    layer_a: str = "nan0_0minus1",
+    cmap_c: str = "coolwarm",
+    cmap_gpc: str = "coolwarm",
+    cmap_cpg: str = "viridis",
+    cmap_a: str = "coolwarm",
+    min_quality: float = 20,
+    min_length: int = 200,
+    min_mapped_length_to_reference_length_ratio: float = 0.8,
+    min_position_valid_fraction: float = 0.5,
+    sample_mapping: Optional[Mapping[str, str]] = None,
+    save_path: str | Path | None = None,
+    sort_by: str = "gpc",  # 'gpc','cpg','c','gpc_cpg','a','none','obs:<col>'
+    bins: Optional[Dict[str, Any]] = None,
+    deaminase: bool = False,
+    min_signal: float = 0,
+    n_xticks_any_c: int = 10,
+    n_xticks_gpc: int = 10,
+    n_xticks_cpg: int = 10,
+    n_xticks_any_a: int = 10,
+    xtick_rotation: int = 90,
+    xtick_fontsize: int = 9,
+    index_col_suffix: str | None = None,
+):
+    """
+    Plot stacked heatmaps + per-position mean barplots for C, GpC, CpG, and optional A.
-    results = []
+    Key fixes vs old version:
+      - order computed ONCE per bin, applied to all matrices
+      - no hard-coded axes indices
+      - NaNs excluded from methylation denominators
+      - var_names not forced to int
+      - fixed count of x tick labels per block (controllable)
+      - adata.uns updated once at end
+    Returns
+    -------
+    results : list[dict]
+        One entry per (sample, ref) plot with matrices + bin metadata.
+    """
+    results: List[Dict[str, Any]] = []
+    save_path = Path(save_path) if save_path is not None else None
+    if save_path is not None:
+        save_path.mkdir(parents=True, exist_ok=True)
+    # Ensure categorical
+    for col in (sample_col, reference_col):
+        if col not in adata.obs:
+            raise KeyError(f"{col} not in adata.obs")
+        if not pd.api.types.is_categorical_dtype(adata.obs[col]):
+            adata.obs[col] = adata.obs[col].astype("category")
+    base_set = set(mod_target_bases)
+    include_any_c = any(b in {"C", "CpG", "GpC"} for b in base_set)
+    include_any_a = "A" in base_set
     for ref in adata.obs[reference_col].cat.categories:
         for sample in adata.obs[sample_col].cat.categories:
+            # Optionally remap sample label for display
+            display_sample = sample_mapping.get(sample, sample) if sample_mapping else sample
             try:
                 subset = adata[
                     (adata.obs[reference_col] == ref) &
                     (adata.obs[sample_col] == sample) &
-                    (adata.obs['read_quality'] >= min_quality) &
-                    (adata.obs['mapped_length'] >= min_length) &
-                    (adata.obs['mapped_length_to_reference_length_ratio'] >= min_mapped_length_to_reference_length_ratio)
+                    (adata.obs["read_quality"] >= min_quality) &
+                    (adata.obs["mapped_length"] >= min_length) &
+                    (adata.obs["mapped_length_to_reference_length_ratio"] >= min_mapped_length_to_reference_length_ratio)
                 ]
-                mask = subset.var[f"{ref}_valid_fraction"].astype(float) > float(min_position_valid_fraction)
-                subset = subset[:, mask]
+                # position-level mask
+                valid_key = f"{ref}_valid_fraction"
+                if valid_key in subset.var:
+                    mask = subset.var[valid_key].astype(float).values > float(min_position_valid_fraction)
+                    subset = subset[:, mask]
                 if subset.shape[0] == 0:
-                    print(f"  No reads left after filtering for {sample} - {ref}")
+                    print(f"No reads left after filtering for {display_sample} - {ref}")
                     continue
-                if bins:
-                    print(f"Using defined bins to subset clustermap for {sample} - {ref}")
-                    bins_temp = bins
+                # bins mode
+                if bins is None:
+                    bins_temp = {"All": (subset.obs[reference_col] == ref)}
                 else:
-                    print(f"Using all reads for clustermap for {sample} - {ref}")
-                    bins_temp = {"All": (subset.obs['Reference_strand'] == ref)}
-                # Get column positions (not var_names!) of site masks
-                any_c_sites = np.where(subset.var[f"{ref}_any_C_site"].values)[0]
-                gpc_sites = np.where(subset.var[f"{ref}_GpC_site"].values)[0]
-                cpg_sites = np.where(subset.var[f"{ref}_CpG_site"].values)[0]
-                num_any_c = len(any_c_sites)
-                num_gpc = len(gpc_sites)
-                num_cpg = len(cpg_sites)
-                print(f"Found {num_gpc} GpC sites at {gpc_sites} \nand {num_cpg} CpG sites at {cpg_sites}\n and {num_any_c} any_C sites at {any_c_sites} for {sample} - {ref}")
-                # Use var_names for x-axis tick labels
-                gpc_labels = subset.var_names[gpc_sites].astype(int)
-                cpg_labels = subset.var_names[cpg_sites].astype(int)
-                any_c_labels = subset.var_names[any_c_sites].astype(int)
-                stacked_any_c, stacked_gpc, stacked_cpg = [], [], []
-                row_labels, bin_labels = [], []
-                bin_boundaries = []
+                    bins_temp = bins
-                total_reads = subset.shape[0]
+                # find sites (positions)
+                any_c_sites = gpc_sites = cpg_sites = np.array([], dtype=int)
+                any_a_sites = np.array([], dtype=int)
+                num_any_c = num_gpc = num_cpg = num_any_a = 0
+                if include_any_c:
+                    any_c_sites = np.where(subset.var.get(f"{ref}_C_site", False).values)[0]
+                    gpc_sites   = np.where(subset.var.get(f"{ref}_GpC_site", False).values)[0]
+                    cpg_sites   = np.where(subset.var.get(f"{ref}_CpG_site", False).values)[0]
+                    num_any_c, num_gpc, num_cpg = len(any_c_sites), len(gpc_sites), len(cpg_sites)
+                    any_c_labels = _select_labels(subset, any_c_sites, ref, index_col_suffix)
+                    gpc_labels   = _select_labels(subset, gpc_sites, ref, index_col_suffix)
+                    cpg_labels   = _select_labels(subset, cpg_sites, ref, index_col_suffix)
+                if include_any_a:
+                    any_a_sites = np.where(subset.var.get(f"{ref}_A_site", False).values)[0]
+                    num_any_a = len(any_a_sites)
+                    any_a_labels = _select_labels(subset, any_a_sites, ref, index_col_suffix)
+                stacked_any_c, stacked_gpc, stacked_cpg, stacked_any_a = [], [], [], []
+                row_labels, bin_labels, bin_boundaries = [], [], []
                 percentages = {}
                 last_idx = 0
+                total_reads = subset.shape[0]
+                # ----------------------------
+                # per-bin stacking
+                # ----------------------------
                 for bin_label, bin_filter in bins_temp.items():
                     subset_bin = subset[bin_filter].copy()
                     num_reads = subset_bin.shape[0]
-                    print(f"analyzing {num_reads} reads for {bin_label} bin in {sample} - {ref}")
-                    percent_reads = (num_reads / total_reads) * 100 if total_reads > 0 else 0
+                    if num_reads == 0:
+                        percentages[bin_label] = 0.0
+                        continue
+                    percent_reads = (num_reads / total_reads) * 100
                     percentages[bin_label] = percent_reads
-                    if num_reads > 0 and num_cpg > 0 and num_gpc > 0:
-                        # Determine sorting order
-                        if sort_by.startswith("obs:"):
-                            colname = sort_by.split("obs:")[1]
-                            order = np.argsort(subset_bin.obs[colname].values)
-                        elif sort_by == "gpc":
-                            linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "cpg":
-                            linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "any_c":
-                            linkage = sch.linkage(subset_bin[:, any_c_sites].layers[layer_any_c], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "gpc_cpg":
-                            linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
-                            order = sch.leaves_list(linkage)
-                        elif sort_by == "none":
-                            order = np.arange(num_reads)
-                        else:
-                            raise ValueError(f"Unsupported sort_by option: {sort_by}")
+                    # compute order ONCE
+                    if sort_by.startswith("obs:"):
+                        colname = sort_by.split("obs:")[1]
+                        order = np.argsort(subset_bin.obs[colname].values)
-                        stacked_any_c.append(subset_bin[order][:, any_c_sites].layers[layer_any_c])
-                        stacked_gpc.append(subset_bin[order][:, gpc_sites].layers[layer_gpc])
-                        stacked_cpg.append(subset_bin[order][:, cpg_sites].layers[layer_cpg])
+                    elif sort_by == "gpc" and num_gpc > 0:
+                        linkage = sch.linkage(subset_bin[:, gpc_sites].layers[layer_gpc], method="ward")
+                        order = sch.leaves_list(linkage)
-                        row_labels.extend([bin_label] * num_reads)
-                        bin_labels.append(f"{bin_label}: {num_reads} reads ({percent_reads:.1f}%)")
-                        last_idx += num_reads
-                        bin_boundaries.append(last_idx)
+                    elif sort_by == "cpg" and num_cpg > 0:
+                        linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
+                        order = sch.leaves_list(linkage)
-                if stacked_any_c:
+                    elif sort_by == "c" and num_any_c > 0:
+                        linkage = sch.linkage(subset_bin[:, any_c_sites].layers[layer_c], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "gpc_cpg":
+                        linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "a" and num_any_a > 0:
+                        linkage = sch.linkage(subset_bin[:, any_a_sites].layers[layer_a], method="ward")
+                        order = sch.leaves_list(linkage)
+                    elif sort_by == "none":
+                        order = np.arange(num_reads)
+                    else:
+                        order = np.arange(num_reads)
+                    subset_bin = subset_bin[order]
+                    # stack consistently
+                    if include_any_c and num_any_c > 0:
+                        stacked_any_c.append(subset_bin[:, any_c_sites].layers[layer_c])
+                    if include_any_c and num_gpc > 0:
+                        stacked_gpc.append(subset_bin[:, gpc_sites].layers[layer_gpc])
+                    if include_any_c and num_cpg > 0:
+                        stacked_cpg.append(subset_bin[:, cpg_sites].layers[layer_cpg])
+                    if include_any_a and num_any_a > 0:
+                        stacked_any_a.append(subset_bin[:, any_a_sites].layers[layer_a])
+                    row_labels.extend([bin_label] * num_reads)
+                    bin_labels.append(f"{bin_label}: {num_reads} reads ({percent_reads:.1f}%)")
+                    last_idx += num_reads
+                    bin_boundaries.append(last_idx)
+                # ----------------------------
+                # build matrices + means
+                # ----------------------------
+                blocks = []  # list of dicts describing what to plot in order
+                if include_any_c and stacked_any_c:
                     any_c_matrix = np.vstack(stacked_any_c)
-                    gpc_matrix = np.vstack(stacked_gpc)
-                    cpg_matrix = np.vstack(stacked_cpg)
-                    if any_c_matrix.size > 0:
-                        def normalized_mean(matrix):
-                            mean = np.nanmean(matrix, axis=0)
-                            normalized = (mean - mean.min()) / (mean.max() - mean.min() + 1e-9)
-                            return normalized
-                        def methylation_fraction(matrix):
-                            methylated = (matrix == 1).sum(axis=0)
-                            valid = (matrix != 0).sum(axis=0)
-                            return np.divide(methylated, valid, out=np.zeros_like(methylated, dtype=float), where=valid != 0)
-                        mean_gpc = methylation_fraction(gpc_matrix)
-                        mean_cpg = methylation_fraction(cpg_matrix)
-                        mean_any_c = methylation_fraction(any_c_matrix)
-                        fig = plt.figure(figsize=(18, 12))
-                        gs = gridspec.GridSpec(2, 3, height_ratios=[1, 6], hspace=0.01)
-                        fig.suptitle(f"{sample} - {ref} - {total_reads} reads", fontsize=14, y=0.95)
-                        axes_heat = [fig.add_subplot(gs[1, i]) for i in range(3)]
-                        axes_bar = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(3)]
-                        clean_barplot(axes_bar[0], mean_any_c, f"any C site Modification Signal")
-                        clean_barplot(axes_bar[1], mean_gpc, f"GpC Modification Signal")
-                        clean_barplot(axes_bar[2], mean_cpg, f"CpG Modification Signal")
+                    gpc_matrix   = np.vstack(stacked_gpc) if stacked_gpc else np.empty((0, 0))
+                    cpg_matrix   = np.vstack(stacked_cpg) if stacked_cpg else np.empty((0, 0))
+                    mean_any_c = methylation_fraction(any_c_matrix) if any_c_matrix.size else None
+                    mean_gpc   = methylation_fraction(gpc_matrix) if gpc_matrix.size else None
+                    mean_cpg   = methylation_fraction(cpg_matrix) if cpg_matrix.size else None
+                    if any_c_matrix.size:
+                        blocks.append(dict(
+                            name="c",
+                            matrix=any_c_matrix,
+                            mean=mean_any_c,
+                            labels=any_c_labels,
+                            cmap=cmap_c,
+                            n_xticks=n_xticks_any_c,
+                            title="any C site Modification Signal"
+                        ))
+                    if gpc_matrix.size:
+                        blocks.append(dict(
+                            name="gpc",
+                            matrix=gpc_matrix,
+                            mean=mean_gpc,
+                            labels=gpc_labels,
+                            cmap=cmap_gpc,
+                            n_xticks=n_xticks_gpc,
+                            title="GpC Modification Signal"
+                        ))
+                    if cpg_matrix.size:
+                        blocks.append(dict(
+                            name="cpg",
+                            matrix=cpg_matrix,
+                            mean=mean_cpg,
+                            labels=cpg_labels,
+                            cmap=cmap_cpg,
+                            n_xticks=n_xticks_cpg,
+                            title="CpG Modification Signal"
+                        ))
+                if include_any_a and stacked_any_a:
+                    any_a_matrix = np.vstack(stacked_any_a)
+                    mean_any_a = methylation_fraction(any_a_matrix) if any_a_matrix.size else None
+                    if any_a_matrix.size:
+                        blocks.append(dict(
+                            name="a",
+                            matrix=any_a_matrix,
+                            mean=mean_any_a,
+                            labels=any_a_labels,
+                            cmap=cmap_a,
+                            n_xticks=n_xticks_any_a,
+                            title="any A site Modification Signal"
+                        ))
+                if not blocks:
+                    print(f"No matrices to plot for {display_sample} - {ref}")
+                    continue
-                        sns.heatmap(any_c_matrix, cmap=cmap_any_c, ax=axes_heat[0], xticklabels=any_c_labels[::20], yticklabels=False, cbar=False)
-                        axes_heat[0].set_xticks(range(0, len(any_c_labels), 20))
-                        axes_heat[0].set_xticklabels(any_c_labels[::20], rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[0].axhline(y=boundary, color="black", linewidth=2)
-                        sns.heatmap(gpc_matrix, cmap=cmap_gpc, ax=axes_heat[1], xticklabels=gpc_labels[::5], yticklabels=False, cbar=False)
-                        axes_heat[1].set_xticks(range(0, len(gpc_labels), 5))
-                        axes_heat[1].set_xticklabels(gpc_labels[::5], rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[1].axhline(y=boundary, color="black", linewidth=2)
-                        sns.heatmap(cpg_matrix, cmap=cmap_cpg, ax=axes_heat[2], xticklabels=cpg_labels, yticklabels=False, cbar=False)
-                        axes_heat[2].set_xticklabels(cpg_labels, rotation=90, fontsize=10)
-                        for boundary in bin_boundaries[:-1]:
-                            axes_heat[2].axhline(y=boundary, color="black", linewidth=2)
-                        plt.tight_layout()
-                        if save_path:
-                            save_name = f"{ref} — {sample}"
-                            os.makedirs(save_path, exist_ok=True)
-                            safe_name = save_name.replace("=", "").replace("__", "_").replace(",", "_")
-                            out_file = os.path.join(save_path, f"{safe_name}.png")
-                            plt.savefig(out_file, dpi=300)
-                            print(f"Saved: {out_file}")
-                            plt.close()
-                        else:
-                            plt.show()
-                        print(f"Summary for {sample} - {ref}:")
-                        for bin_label, percent in percentages.items():
-                            print(f"  - {bin_label}: {percent:.1f}%")
-                        results.append({
-                            "sample": sample,
-                            "ref": ref,
-                            "any_c_matrix": any_c_matrix,
-                            "gpc_matrix": gpc_matrix,
-                            "cpg_matrix": cpg_matrix,
-                            "row_labels": row_labels,
-                            "bin_labels": bin_labels,
-                            "bin_boundaries": bin_boundaries,
-                            "percentages": percentages
-                        })
-                        adata.uns['clustermap_results'] = results
+                gs_dim = len(blocks)
+                fig = plt.figure(figsize=(5.5 * gs_dim, 11))
+                gs = gridspec.GridSpec(2, gs_dim, height_ratios=[1, 6], hspace=0.02)
+                fig.suptitle(f"{display_sample} - {ref} - {total_reads} reads", fontsize=14, y=0.97)
+                axes_heat = [fig.add_subplot(gs[1, i]) for i in range(gs_dim)]
+                axes_bar  = [fig.add_subplot(gs[0, i], sharex=axes_heat[i]) for i in range(gs_dim)]
+                # ----------------------------
+                # plot blocks
+                # ----------------------------
+                for i, blk in enumerate(blocks):
+                    mat = blk["matrix"]
+                    mean = blk["mean"]
+                    labels = np.asarray(blk["labels"], dtype=str)
+                    n_xticks = blk["n_xticks"]
+                    # barplot
+                    clean_barplot(axes_bar[i], mean, blk["title"])
+                    # heatmap
+                    sns.heatmap(
+                        mat,
+                        cmap=blk["cmap"],
+                        ax=axes_heat[i],
+                        yticklabels=False,
+                        cbar=False
+                    )
+                    # fixed tick labels
+                    tick_pos = _fixed_tick_positions(len(labels), n_xticks)
+                    axes_heat[i].set_xticks(tick_pos)
+                    axes_heat[i].set_xticklabels(
+                        labels[tick_pos],
+                        rotation=xtick_rotation,
+                        fontsize=xtick_fontsize
+                    )
+                    # bin separators
+                    for boundary in bin_boundaries[:-1]:
+                        axes_heat[i].axhline(y=boundary, color="black", linewidth=2)
+                    axes_heat[i].set_xlabel("Position", fontsize=9)
+                plt.tight_layout()
+                # save or show
+                if save_path is not None:
+                    safe_name = f"{ref}__{display_sample}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
+                    out_file = save_path / f"{safe_name}.png"
+                    fig.savefig(out_file, dpi=300)
+                    plt.close(fig)
+                    print(f"Saved: {out_file}")
+                else:
+                    plt.show()
+                # record results
+                rec = {
+                    "sample": str(sample),
+                    "ref": str(ref),
+                    "row_labels": row_labels,
+                    "bin_labels": bin_labels,
+                    "bin_boundaries": bin_boundaries,
+                    "percentages": percentages,
+                }
+                for blk in blocks:
+                    rec[f"{blk['name']}_matrix"] = blk["matrix"]
+                    rec[f"{blk['name']}_labels"] = list(map(str, blk["labels"]))
+                results.append(rec)
+                print(f"Summary for {display_sample} - {ref}:")
+                for bin_label, percent in percentages.items():
+                    print(f"  - {bin_label}: {percent:.1f}%")
             except Exception as e:
                 import traceback
                 traceback.print_exc()
                 continue
-import os
-import math
-from typing import List, Optional, Sequence, Tuple
+    # Not stored in adata.uns: the matrices are not HDF5-safe. To persist, store once
+    # at the end with only metadata (+ maybe hit counts), e.g.:
+    # adata.uns["clustermap_results"] = [
+    #     {k: v for k, v in r.items() if not k.endswith("_matrix")}
+    #     for r in results
+    # ]
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
+    return results
 def plot_hmm_layers_rolling_by_sample_ref(
     adata,
@@ -466,7 +1187,7 @@ def plot_hmm_layers_rolling_by_sample_ref(
     output_dir: Optional[str] = None,
     save: bool = True,
     show_raw: bool = False,
-    cmap: str = "tab10",
+    cmap: str = "tab20",
     use_var_coords: bool = True,
 ):
     """

smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

smftools 0.2.1py3-none-any.whl → 0.2.4py3-none-any.whl