smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/HMM.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import math
|
|
2
|
-
from typing import List, Optional, Tuple, Union, Any, Dict
|
|
2
|
+
from typing import List, Optional, Tuple, Union, Any, Dict, Sequence
|
|
3
3
|
import ast
|
|
4
4
|
import json
|
|
5
5
|
|
|
@@ -772,6 +772,8 @@ class HMM(nn.Module):
|
|
|
772
772
|
verbose: bool = True,
|
|
773
773
|
uns_key: str = "hmm_appended_layers",
|
|
774
774
|
config: Optional[Union[dict, "ExperimentConfig"]] = None, # NEW: config/dict accepted
|
|
775
|
+
uns_flag: str = "hmm_annotated",
|
|
776
|
+
force_redo: bool = False
|
|
775
777
|
):
|
|
776
778
|
"""
|
|
777
779
|
Annotate an AnnData with HMM-derived features (in adata.obs and adata.layers).
|
|
@@ -793,6 +795,12 @@ class HMM(nn.Module):
|
|
|
793
795
|
import torch as _torch
|
|
794
796
|
from tqdm import trange, tqdm as _tqdm
|
|
795
797
|
|
|
798
|
+
# Only run if not already performed
|
|
799
|
+
already = bool(adata.uns.get(uns_flag, False))
|
|
800
|
+
if (already and not force_redo):
|
|
801
|
+
# QC already performed; nothing to do
|
|
802
|
+
return None if in_place else adata
|
|
803
|
+
|
|
796
804
|
# small helpers
|
|
797
805
|
def _try_json_or_literal(s):
|
|
798
806
|
if s is None:
|
|
@@ -1298,6 +1306,9 @@ class HMM(nn.Module):
|
|
|
1298
1306
|
new_list = existing + [l for l in appended_layers if l not in existing]
|
|
1299
1307
|
adata.uns[uns_key] = new_list
|
|
1300
1308
|
|
|
1309
|
+
# Mark that the annotation has been completed
|
|
1310
|
+
adata.uns[uns_flag] = True
|
|
1311
|
+
|
|
1301
1312
|
return None if in_place else adata
|
|
1302
1313
|
|
|
1303
1314
|
def merge_intervals_in_layer(
|
smftools/hmm/__init__.py
CHANGED
|
@@ -1,20 +1,14 @@
|
|
|
1
|
-
from .apply_hmm_batched import apply_hmm_batched
|
|
2
|
-
from .calculate_distances import calculate_distances
|
|
3
1
|
from .call_hmm_peaks import call_hmm_peaks
|
|
4
2
|
from .display_hmm import display_hmm
|
|
5
3
|
from .hmm_readwrite import load_hmm, save_hmm
|
|
6
4
|
from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
|
|
7
|
-
from .train_hmm import train_hmm
|
|
8
5
|
|
|
9
6
|
|
|
10
7
|
__all__ = [
|
|
11
|
-
"apply_hmm_batched",
|
|
12
|
-
"calculate_distances",
|
|
13
8
|
"call_hmm_peaks",
|
|
14
9
|
"display_hmm",
|
|
15
10
|
"load_hmm",
|
|
16
11
|
"refine_nucleosome_calls",
|
|
17
12
|
"infer_nucleosomes_in_large_bound",
|
|
18
13
|
"save_hmm",
|
|
19
|
-
"train_hmm"
|
|
20
14
|
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
def call_hmm_peaks(
    adata,
    feature_configs,
    obs_column='Reference_strand',
    site_types=['GpC_site', 'CpG_site'],
    save_plot=False,
    output_dir=None,
    date_tag=None,
    inplace=False
):
    """
    Call peaks on the column-wise mean of each feature layer and annotate
    adata.var / adata.obs with per-peak window columns.

    Parameters
    ----------
    adata : AnnData
        Input AnnData whose .layers contain the feature tracks to scan.
    feature_configs : dict
        Mapping layer_name -> config dict with optional keys
        'min_distance' (default 200), 'peak_width' (200),
        'peak_prominence' (0.2), 'peak_threshold' (0.8).
    obs_column : str
        Obs column defining reference groups (made categorical if needed).
    site_types : list of str
        Site-type names; expects var columns named f"{ref}_{site_type}".
    save_plot : bool
        If True (and output_dir is set), save the diagnostic plot
        instead of showing it.
    output_dir : str or None
        Directory for saved plots.
    date_tag : str or None
        Optional filename prefix for saved plots.
    inplace : bool
        If False, operate on a copy and return it; if True, modify
        adata and return None.

    Returns
    -------
    AnnData or None
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.signal import find_peaks

    if not inplace:
        adata = adata.copy()

    # Ensure obs_column is categorical so .cat.categories is available below.
    if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
        adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])

    # Coordinates come from var_names; assumes they are integer-like — TODO confirm.
    coordinates = adata.var_names.astype(int).values
    peak_columns = []

    # Buffer new obs columns and concat once at the end (avoids repeated inserts).
    obs_updates = {}

    for feature_layer, config in feature_configs.items():
        min_distance = config.get('min_distance', 200)
        peak_width = config.get('peak_width', 200)
        peak_prominence = config.get('peak_prominence', 0.2)
        peak_threshold = config.get('peak_threshold', 0.8)

        matrix = adata.layers[feature_layer]
        means = np.mean(matrix, axis=0)
        peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
        peak_centers = coordinates[peak_indices]
        adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()

        # Diagnostic plot of the mean track with called peak windows.
        plt.figure(figsize=(6, 3))
        plt.plot(coordinates, means)
        plt.title(f"{feature_layer} with peak calls")
        plt.xlabel("Genomic position")
        plt.ylabel("Mean intensity")
        for i, center in enumerate(peak_centers):
            start, end = center - peak_width // 2, center + peak_width // 2
            plt.axvspan(start, end, color='purple', alpha=0.2)
            plt.axvline(center, color='red', linestyle='--')
            # Alternate label anchor left/right so adjacent labels overlap less.
            aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
            plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
        if save_plot and output_dir:
            filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
            plt.savefig(filename, bbox_inches='tight')
            # BUGFIX: report the actual saved path (message previously had no placeholder).
            print(f"Saved plot to {filename}")
        else:
            plt.show()

        feature_peak_columns = []
        for center in peak_centers:
            start, end = center - peak_width // 2, center + peak_width // 2
            colname = f'{feature_layer}_peak_{center}'
            peak_columns.append(colname)
            feature_peak_columns.append(colname)

            # Var-level mask: positions inside this peak window.
            peak_mask = (coordinates >= start) & (coordinates <= end)
            adata.var[colname] = peak_mask

            region = matrix[:, peak_mask]
            obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
            obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
            obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold

            # Initialize per-site-type summary columns; filled per reference below.
            for site_type in site_types:
                adata.obs[f'{site_type}_sum_around_{center}'] = 0
                adata.obs[f'{site_type}_mean_around_{center}'] = np.nan

            for ref in adata.obs[obs_column].cat.categories:
                ref_idx = adata.obs[obs_column] == ref
                for site_type in site_types:
                    # BUGFIX: build the mask key per site_type *inside* this loop.
                    # Previously it was computed once per ref from a stale
                    # site_type left over from the initialization loop, so every
                    # site type was checked against the same var column.
                    mask_key = f"{ref}_{site_type}"
                    if mask_key not in adata.var:
                        continue
                    site_mask = adata.var[mask_key].values
                    site_coords = coordinates[site_mask]
                    region_mask = (site_coords >= start) & (site_coords <= end)
                    if not region_mask.any():
                        continue
                    # Expand the window-restricted site selection back to full var length.
                    full_mask = site_mask.copy()
                    full_mask[site_mask] = region_mask
                    site_region = adata[ref_idx, full_mask].X
                    if hasattr(site_region, "A"):
                        site_region = site_region.A  # sparse -> dense
                    if site_region.shape[1] > 0:
                        adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
                        adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)

        adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
        print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")

    adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
    adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)

    return adata if not inplace else None
|
smftools/hmm/call_hmm_peaks.py
CHANGED
|
@@ -1,106 +1,334 @@
|
|
|
1
|
+
from typing import Dict, Optional, Any, Union, Sequence
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
1
4
|
def call_hmm_peaks(
    adata,
    feature_configs: Dict[str, Dict[str, Any]],
    ref_column: str = "Reference_strand",
    site_types: Sequence[str] = ("GpC", "CpG"),
    save_plot: bool = False,
    output_dir: Optional[Union[str, "Path"]] = None,
    date_tag: Optional[str] = None,
    inplace: bool = True,
    index_col_suffix: Optional[str] = None,
    alternate_labels: bool = False,
):
    """
    Call peaks on one or more HMM-derived (or other) layers and annotate adata.var / adata.obs,
    doing peak calling *within each reference subset*.

    Parameters
    ----------
    adata : AnnData
        Input AnnData with layers already containing feature tracks (e.g. HMM-derived masks).
    feature_configs : dict
        Mapping: feature_type_or_layer_suffix -> {
            "min_distance": int (default 200),
            "peak_width": int (default 200),
            "peak_prominence": float (default 0.2),
            "peak_threshold": float (default 0.8),
            "rolling_window": int (default 1; >1 enables rolling-mean smoothing),
        }

        Keys are usually *feature types* like "all_accessible_features" or
        "small_bound_stretch". These are matched against existing HMM layers
        (e.g. "GpC_all_accessible_features", "Combined_small_bound_stretch")
        using a suffix match. You can also pass full layer names if you wish.
    ref_column : str
        Column in adata.obs defining reference groups (e.g. "Reference_strand").
    site_types : sequence of str
        Site types (without "_site"); expects var columns like f"{ref}_{site_type}_site".
        e.g. ("GpC", "CpG") -> "6B6_top_GpC_site", etc.
    save_plot : bool
        If True, save peak diagnostic plots instead of just showing them.
    output_dir : path-like or None
        Directory for saved plots (created if needed).
    date_tag : str or None
        Optional tag to prefix plot filenames.
    inplace : bool
        If False, operate on a copy and return it. If True, modify adata and return None.
    index_col_suffix : str or None
        If None, coordinates come from adata.var_names (cast to int when possible).
        If set, for each ref we use adata.var[f"{ref}_{index_col_suffix}"] as the
        coordinate system (e.g. a reindexed coordinate).
    alternate_labels : bool
        If True, alternate peak-label anchoring left/right to reduce overlap.

    Returns
    -------
    None or AnnData
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.signal import find_peaks
    from scipy.sparse import issparse

    if not inplace:
        adata = adata.copy()

    # Ensure ref_column is categorical.
    # NOTE: pd.api.types.is_categorical_dtype is deprecated (pandas >= 2.0);
    # the isinstance check is the supported replacement and matches usage
    # elsewhere in this package.
    if not isinstance(adata.obs[ref_column].dtype, pd.CategoricalDtype):
        adata.obs[ref_column] = adata.obs[ref_column].astype("category")

    # Base coordinates (fallback when no per-ref coordinate column is requested)
    try:
        base_coordinates = adata.var_names.astype(int).values
    except Exception:
        base_coordinates = np.arange(adata.n_vars, dtype=int)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # HMM layers known to the object (if present)
    hmm_layers = list(adata.uns.get("hmm_appended_layers", []))
    # keep only the binary masks, not *_lengths
    hmm_layers = [layer for layer in hmm_layers if not layer.endswith("_lengths")]

    # Fallback: use all layer names if hmm_appended_layers is empty/missing
    all_layer_names = list(adata.layers.keys())

    all_peak_var_cols = []

    # Iterate over each reference separately
    for ref in adata.obs[ref_column].cat.categories:
        ref_mask = (adata.obs[ref_column] == ref).values
        if not ref_mask.any():
            continue

        # Per-ref coordinates: either from a reindexed column or global fallback
        if index_col_suffix is not None:
            coord_col = f"{ref}_{index_col_suffix}"
            if coord_col not in adata.var:
                raise KeyError(
                    f"index_col_suffix='{index_col_suffix}' requested, "
                    f"but var column '{coord_col}' is missing for ref '{ref}'."
                )
            coord_vals = adata.var[coord_col].values
            # Try to coerce to numeric
            try:
                coordinates = coord_vals.astype(int)
            except Exception:
                coordinates = np.asarray(coord_vals, dtype=float)
        else:
            coordinates = base_coordinates

        # Resolve each feature_config key to one or more actual layer names
        for feature_key, config in feature_configs.items():
            # Candidate search space: HMM layers if present, else all layers
            search_layers = hmm_layers if hmm_layers else all_layer_names

            candidate_layers = []

            # First: exact match
            for lname in search_layers:
                if lname == feature_key:
                    candidate_layers.append(lname)

            # Second: suffix match (e.g. "all_accessible_features" ->
            # "GpC_all_accessible_features", "Combined_all_accessible_features", etc.)
            if not candidate_layers:
                for lname in search_layers:
                    if lname.endswith(feature_key):
                        candidate_layers.append(lname)

            # Third: if user passed a full layer name that wasn't in hmm_layers,
            # but does exist in adata.layers, allow it.
            if not candidate_layers and feature_key in adata.layers:
                candidate_layers.append(feature_key)

            if not candidate_layers:
                print(
                    f"[call_hmm_peaks] WARNING: no layers found matching feature key "
                    f"'{feature_key}' in ref '{ref}'. Skipping."
                )
                continue

            # Run peak calling on each resolved layer for this ref
            for layer_name in candidate_layers:
                if layer_name not in adata.layers:
                    print(
                        f"[call_hmm_peaks] WARNING: resolved layer '{layer_name}' "
                        f"not found in adata.layers; skipping."
                    )
                    continue

                min_distance = int(config.get("min_distance", 200))
                peak_width = int(config.get("peak_width", 200))
                peak_prominence = float(config.get("peak_prominence", 0.2))
                peak_threshold = float(config.get("peak_threshold", 0.8))

                layer_data = adata.layers[layer_name]
                if issparse(layer_data):
                    layer_data = layer_data.toarray()
                else:
                    layer_data = np.asarray(layer_data)

                # Subset rows for this ref
                matrix = layer_data[ref_mask, :]  # (n_ref_reads, n_vars)
                if matrix.shape[0] == 0:
                    continue

                # Mean signal along positions (within this ref only)
                means = np.nanmean(matrix, axis=0)

                # Optional rolling-mean smoothing before peak detection
                rolling_window = int(config.get("rolling_window", 1))
                if rolling_window > 1:
                    # Simple centered rolling mean via convolution
                    kernel = np.ones(rolling_window, dtype=float) / float(rolling_window)
                    peak_metric = np.convolve(means, kernel, mode="same")
                else:
                    peak_metric = means

                # Peak detection
                peak_indices, _ = find_peaks(
                    peak_metric, prominence=peak_prominence, distance=min_distance
                )
                if peak_indices.size == 0:
                    print(
                        f"[call_hmm_peaks] No peaks found for layer '{layer_name}' "
                        f"in ref '{ref}'."
                    )
                    continue

                peak_centers = coordinates[peak_indices]
                # Store per-ref peak centers
                adata.uns[f"{layer_name}_{ref}_peak_centers"] = peak_centers.tolist()

                # ---- Plot ----
                plt.figure(figsize=(6, 3))
                plt.plot(coordinates, peak_metric, linewidth=1)
                plt.title(f"{layer_name} peaks in {ref}")
                plt.xlabel("Coordinate")
                plt.ylabel(f"Rolling Mean - roll size {rolling_window}")

                for i, center in enumerate(peak_centers):
                    start = center - peak_width // 2
                    end = center + peak_width // 2
                    height = peak_metric[peak_indices[i]]
                    plt.axvspan(start, end, color="purple", alpha=0.2)
                    plt.axvline(center, color="red", linestyle="--", linewidth=0.8)

                    # alternate label placement a bit left/right
                    if alternate_labels:
                        if i % 2 == 0:
                            x_text, ha = start, "right"
                        else:
                            x_text, ha = end, "left"
                    else:
                        x_text, ha = start, "right"

                    plt.text(
                        x_text,
                        height * 0.8,
                        f"Peak {i}\n{center}",
                        color="red",
                        ha=ha,
                        va="bottom",
                        fontsize=8,
                    )

                if save_plot and output_dir is not None:
                    tag = date_tag or "output"
                    # include ref in filename
                    safe_ref = str(ref).replace("/", "_")
                    safe_layer = str(layer_name).replace("/", "_")
                    fname = output_dir / f"{tag}_{safe_layer}_{safe_ref}_peaks.png"
                    plt.savefig(fname, bbox_inches="tight", dpi=200)
                    print(f"[call_hmm_peaks] Saved plot to {fname}")
                    plt.close()
                else:
                    plt.tight_layout()
                    plt.show()

                feature_peak_cols = []

                # ---- Per-peak annotations (within this ref) ----
                for center in peak_centers:
                    start = center - peak_width // 2
                    end = center + peak_width // 2

                    # Make column names ref- and layer-specific so they don't collide
                    colname = f"{layer_name}_{ref}_peak_{center}"
                    feature_peak_cols.append(colname)
                    all_peak_var_cols.append(colname)

                    # Var-level mask: is this position in the window?
                    peak_mask = (coordinates >= start) & (coordinates <= end)
                    adata.var[colname] = peak_mask

                    # Extract signal in that window from the *ref subset* matrix
                    region = matrix[:, peak_mask]  # (n_ref_reads, n_positions_in_window)

                    # Per-read summary in this window for the feature layer itself
                    mean_col = f"mean_{layer_name}_{ref}_around_{center}"
                    sum_col = f"sum_{layer_name}_{ref}_around_{center}"
                    present_col = f"{layer_name}_{ref}_present_at_{center}"

                    # Create columns if missing, then fill only the ref rows
                    if mean_col not in adata.obs:
                        adata.obs[mean_col] = np.nan
                    if sum_col not in adata.obs:
                        adata.obs[sum_col] = 0.0
                    if present_col not in adata.obs:
                        adata.obs[present_col] = False

                    adata.obs.loc[ref_mask, mean_col] = np.nanmean(region, axis=1)
                    adata.obs.loc[ref_mask, sum_col] = np.nansum(region, axis=1)
                    adata.obs.loc[ref_mask, present_col] = (
                        adata.obs.loc[ref_mask, mean_col].values > peak_threshold
                    )

                    # Initialize site-type summaries (global columns; filled per ref)
                    for site_type in site_types:
                        sum_site_col = f"{site_type}_{ref}_sum_around_{center}"
                        mean_site_col = f"{site_type}_{ref}_mean_around_{center}"
                        if sum_site_col not in adata.obs:
                            adata.obs[sum_site_col] = 0.0
                        if mean_site_col not in adata.obs:
                            adata.obs[mean_site_col] = np.nan

                    # Per-site-type summaries for this ref
                    for site_type in site_types:
                        mask_key = f"{ref}_{site_type}_site"
                        if mask_key not in adata.var:
                            continue

                        site_mask = adata.var[mask_key].values.astype(bool)
                        if not site_mask.any():
                            continue

                        site_coords = coordinates[site_mask]
                        region_mask = (site_coords >= start) & (site_coords <= end)
                        if not region_mask.any():
                            continue

                        full_mask = np.zeros_like(site_mask, dtype=bool)
                        full_mask[site_mask] = region_mask

                        site_region = adata[ref_mask, full_mask].X
                        # MODERNIZED: the sparse ".A" shortcut is deprecated/removed
                        # in recent SciPy; use issparse + toarray() instead.
                        if issparse(site_region):
                            site_region = site_region.toarray()  # sparse -> dense

                        if site_region.shape[1] == 0:
                            continue

                        sum_site_col = f"{site_type}_{ref}_sum_around_{center}"
                        mean_site_col = f"{site_type}_{ref}_mean_around_{center}"

                        adata.obs.loc[ref_mask, sum_site_col] = np.nansum(site_region, axis=1)
                        adata.obs.loc[ref_mask, mean_site_col] = np.nanmean(site_region, axis=1)

                # Mark "any peak" for this (layer, ref)
                any_col = f"is_in_any_{layer_name}_peak_{ref}"
                adata.var[any_col] = adata.var[feature_peak_cols].any(axis=1)
                print(
                    f"[call_hmm_peaks] Annotated {len(peak_centers)} peaks "
                    f"for layer '{layer_name}' in ref '{ref}'."
                )

    # Global any-peak flag across all feature layers and references
    if all_peak_var_cols:
        adata.var["is_in_any_peak"] = adata.var[all_peak_var_cols].any(axis=1)

    return None if inplace else adata
|