smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/__init__.py
CHANGED
@@ -1,8 +1,7 @@
 from .call_hmm_peaks import call_hmm_peaks
 from .display_hmm import display_hmm
 from .hmm_readwrite import load_hmm, save_hmm
-from .nucleosome_hmm_refinement import
-
+from .nucleosome_hmm_refinement import infer_nucleosomes_in_large_bound, refine_nucleosome_calls
 
 __all__ = [
     "call_hmm_peaks",
@@ -11,4 +10,4 @@ __all__ = [
     "refine_nucleosome_calls",
     "infer_nucleosomes_in_large_bound",
     "save_hmm",
-]
+]
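With this change both refinement helpers are re-exported from the subpackage root, so downstream code can import them in one line; a minimal sketch of the intended import path:

    from smftools.hmm import infer_nucleosomes_in_large_bound, refine_nucleosome_calls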
smftools/hmm/archived/call_hmm_peaks.py
CHANGED
@@ -8,6 +8,21 @@ def call_hmm_peaks(
     date_tag=None,
     inplace=False
 ):
+    """Call peaks from HMM feature layers and annotate AnnData.
+
+    Args:
+        adata: AnnData containing feature layers.
+        feature_configs: Mapping of layer name to peak config.
+        obs_column: Obs column for reference categories.
+        site_types: Site types to summarize around peaks.
+        save_plot: Whether to save peak plots.
+        output_dir: Output directory for plots.
+        date_tag: Optional tag for plot filenames.
+        inplace: Whether to modify AnnData in place.
+
+    Returns:
+        Annotated AnnData with peak masks and summary columns.
+    """
     import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt
@@ -103,4 +118,4 @@ def call_hmm_peaks(
     adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
     adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)
 
-    return adata if not inplace else None
+    return adata if not inplace else None
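The docstring added above pins down the archived function's contract: it annotates the AnnData and returns it unless inplace=True, in which case it returns None. A minimal usage sketch of that contract; the config key and input path below are hypothetical, not taken from the package:

    # hypothetical usage of the documented signature
    import anndata as ad

    adata = ad.read_h5ad("experiment.h5ad")                         # hypothetical input
    configs = {"GpC_all_accessible_features": {"peak_width": 200}}  # hypothetical config
    annotated = call_hmm_peaks(adata, configs, inplace=False)       # returns annotated AnnData
    call_hmm_peaks(annotated, configs, inplace=True)                # annotates in place, returns None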
smftools/hmm/call_hmm_peaks.py
CHANGED
@@ -1,5 +1,12 @@
-
+# FILE: smftools/hmm/call_hmm_peaks.py
+
 from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Union
+
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
 def call_hmm_peaks(
     adata,
@@ -14,96 +21,76 @@ def call_hmm_peaks(
     alternate_labels: bool = False,
 ):
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-    Keys are usually *feature types* like "all_accessible_features" or
-    "small_bound_stretch". These are matched against existing HMM layers
-    (e.g. "GpC_all_accessible_features", "Combined_small_bound_stretch")
-    using a suffix match. You can also pass full layer names if you wish.
-    ref_column : str
-        Column in adata.obs defining reference groups (e.g. "Reference_strand").
-    site_types : sequence of str
-        Site types (without "_site"); expects var columns like f"{ref}_{site_type}_site".
-        e.g. ("GpC", "CpG") -> "6B6_top_GpC_site", etc.
-    save_plot : bool
-        If True, save peak diagnostic plots instead of just showing them.
-    output_dir : path-like or None
-        Directory for saved plots (created if needed).
-    date_tag : str or None
-        Optional tag to prefix plot filenames.
-    inplace : bool
-        If False, operate on a copy and return it. If True, modify adata and return None.
-    index_col_suffix : str or None
-        If None, coordinates come from adata.var_names (cast to int when possible).
-        If set, for each ref we use adata.var[f"{ref}_{index_col_suffix}"] as the
-        coordinate system (e.g. a reindexed coordinate).
-
-    Returns
-    -------
-    None or AnnData
+    Peak calling over HMM (or other) layers, per reference group and per layer.
+    Writes:
+      - adata.uns["{layer}_{ref}_peak_centers"] = list of centers
+      - adata.var["{layer}_{ref}_peak_{center}"] boolean window masks
+      - adata.obs per-read summaries for each peak window:
+          mean_{layer}_{ref}_around_{center}
+          sum_{layer}_{ref}_around_{center}
+          {layer}_{ref}_present_at_{center} (bool)
+        and per site-type:
+          sum_{layer}_{site}_{ref}_around_{center}
+          mean_{layer}_{site}_{ref}_around_{center}
+      - adata.var["is_in_any_{layer}_peak_{ref}"]
+      - adata.var["is_in_any_peak"] (global)
     """
+    import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
-    import matplotlib.pyplot as plt
     from scipy.signal import find_peaks
     from scipy.sparse import issparse
 
     if not inplace:
         adata = adata.copy()
 
-
+    if ref_column not in adata.obs:
+        raise KeyError(f"obs column '{ref_column}' not found")
+
+    # Ensure categorical for predictable ref iteration
     if not pd.api.types.is_categorical_dtype(adata.obs[ref_column]):
         adata.obs[ref_column] = adata.obs[ref_column].astype("category")
 
-    #
+    # Optional: drop duplicate obs columns once to avoid Pandas/AnnData view quirks
+    if getattr(adata.obs.columns, "duplicated", None) is not None:
+        if adata.obs.columns.duplicated().any():
+            adata.obs = adata.obs.loc[:, ~adata.obs.columns.duplicated(keep="first")].copy()
+
+    # Fallback coordinates from var_names
     try:
         base_coordinates = adata.var_names.astype(int).values
     except Exception:
         base_coordinates = np.arange(adata.n_vars, dtype=int)
 
+    # Output dir
     if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
 
-    # HMM layers
-
-
-
-
-
-
+    # Build search pool = union of declared HMM layers and actual layers; exclude helper suffixes
+    declared = list(adata.uns.get("hmm_appended_layers", []) or [])
+    search_pool = [
+        layer
+        for layer in declared
+        if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
+    ]
 
     all_peak_var_cols = []
 
-    # Iterate
+    # Iterate per reference
     for ref in adata.obs[ref_column].cat.categories:
         ref_mask = (adata.obs[ref_column] == ref).values
         if not ref_mask.any():
             continue
 
-        # Per-ref
+        # Per-ref coordinate system
         if index_col_suffix is not None:
             coord_col = f"{ref}_{index_col_suffix}"
             if coord_col not in adata.var:
                 raise KeyError(
-                    f"index_col_suffix='{index_col_suffix}' requested, "
-                    f"but var column '{coord_col}' is missing for ref '{ref}'."
+                    f"index_col_suffix='{index_col_suffix}' requested, missing var column '{coord_col}' for ref '{ref}'."
                 )
             coord_vals = adata.var[coord_col].values
-            # Try to coerce to numeric
             try:
                 coordinates = coord_vals.astype(int)
             except Exception:
@@ -111,184 +98,159 @@ def call_hmm_peaks(
         else:
             coordinates = base_coordinates
 
-
+        if coordinates.shape[0] != adata.n_vars:
+            raise ValueError(f"Coordinate length {coordinates.shape[0]} != n_vars {adata.n_vars}")
+
+        # Feature keys to consider
         for feature_key, config in feature_configs.items():
-            #
-
-
-
-
-
-
-
-
-
-
-
-            if not candidate_layers:
-                for lname in search_layers:
-                    if lname.endswith(feature_key):
-                        candidate_layers.append(lname)
-
-            # Third: if user passed a full layer name that wasn't in hmm_layers,
-            # but does exist in adata.layers, allow it.
-            if not candidate_layers and feature_key in adata.layers:
-                candidate_layers.append(feature_key)
-
-            if not candidate_layers:
-                print(
-                    f"[call_hmm_peaks] WARNING: no layers found matching feature key "
-                    f"'{feature_key}' in ref '{ref}'. Skipping."
+            # Resolve candidate layers: exact → suffix → direct present
+            candidates = [ln for ln in search_pool if ln == feature_key]
+            if not candidates:
+                candidates = [ln for ln in search_pool if str(ln).endswith(feature_key)]
+            if not candidates and feature_key in adata.layers:
+                candidates = [feature_key]
+
+            if not candidates:
+                logger.warning(
+                    "[call_hmm_peaks] No layers found matching '%s' in ref '%s'. Skipping.",
+                    feature_key,
+                    ref,
                 )
                 continue
 
-            #
-
+            # Hyperparams (sanitized)
+            min_distance = max(1, int(config.get("min_distance", 200)))
+            peak_width = max(1, int(config.get("peak_width", 200)))
+            peak_prom = float(config.get("peak_prominence", 0.2))
+            peak_threshold = float(config.get("peak_threshold", 0.8))
+            rolling_window = max(1, int(config.get("rolling_window", 1)))
+
+            for layer_name in candidates:
                 if layer_name not in adata.layers:
-
-
-
+                    logger.warning(
+                        "[call_hmm_peaks] Layer '%s' not in adata.layers; skipping.",
+                        layer_name,
                     )
                     continue
 
-
-
-
-
-
-
-
-
-
-
+                # Dense layer data
+                L = adata.layers[layer_name]
+                L = L.toarray() if issparse(L) else np.asarray(L)
+                if L.shape != (adata.n_obs, adata.n_vars):
+                    logger.warning(
+                        "[call_hmm_peaks] Layer '%s' has shape %s, expected (%s, %s); skipping.",
+                        layer_name,
+                        L.shape,
+                        adata.n_obs,
+                        adata.n_vars,
+                    )
+                    continue
 
-                #
-                matrix =
-                if matrix.shape[0] == 0:
+                # Ref subset
+                matrix = L[ref_mask, :]
+                if matrix.size == 0 or matrix.shape[0] == 0:
                     continue
 
-                # Mean signal along positions (within this ref only)
                 means = np.nanmean(matrix, axis=0)
+                means = np.nan_to_num(means, nan=0.0)
 
-                # Optional rolling-mean smoothing before peak detection
-                rolling_window = int(config.get("rolling_window", 1))
                 if rolling_window > 1:
-                    # Simple centered rolling mean via convolution
                     kernel = np.ones(rolling_window, dtype=float) / float(rolling_window)
-
-                    peak_metric = smoothed
+                    peak_metric = np.convolve(means, kernel, mode="same")
                 else:
                     peak_metric = means
 
                 # Peak detection
                 peak_indices, _ = find_peaks(
-                    peak_metric, prominence=
+                    peak_metric, prominence=peak_prom, distance=min_distance
                 )
                 if peak_indices.size == 0:
-
-
-
+                    logger.info(
+                        "[call_hmm_peaks] No peaks for layer '%s' in ref '%s'.",
+                        layer_name,
+                        ref,
                    )
                    continue
 
                 peak_centers = coordinates[peak_indices]
-                # Store per-ref peak centers
                 adata.uns[f"{layer_name}_{ref}_peak_centers"] = peak_centers.tolist()
 
-                #
-                plt.
-
-
-
-
-
+                # Plot once per layer/ref
+                fig, ax = plt.subplots(figsize=(6, 3))
+                ax.plot(coordinates, peak_metric, linewidth=1)
+                ax.set_title(f"{layer_name} peaks in {ref}")
+                ax.set_xlabel("Coordinate")
+                ax.set_ylabel(f"Rolling Mean (win={rolling_window})")
                 for i, center in enumerate(peak_centers):
                     start = center - peak_width // 2
                     end = center + peak_width // 2
                     height = peak_metric[peak_indices[i]]
-
-
-
-
-
-
-
-                        else:
-                            x_text, ha = end, "left"
-                    else:
-                        x_text, ha = start, "right"
-
-                    plt.text(
-                        x_text,
-                        height * 0.8,
-                        f"Peak {i}\n{center}",
-                        color="red",
-                        ha=ha,
-                        va="bottom",
-                        fontsize=8,
+                    ax.axvspan(start, end, alpha=0.2)
+                    ax.axvline(center, linestyle="--", linewidth=0.8)
+                    x_text, ha = (
+                        (start, "right") if (not alternate_labels or i % 2 == 0) else (end, "left")
+                    )
+                    ax.text(
+                        x_text, height * 0.8, f"Peak {i}\n{center}", ha=ha, va="bottom", fontsize=8
                     )
 
                 if save_plot and output_dir is not None:
                     tag = date_tag or "output"
-                    # include ref in filename
                     safe_ref = str(ref).replace("/", "_")
                     safe_layer = str(layer_name).replace("/", "_")
                     fname = output_dir / f"{tag}_{safe_layer}_{safe_ref}_peaks.png"
-
-
-                    plt.close()
+                    fig.savefig(fname, bbox_inches="tight", dpi=200)
+                    logger.info("[call_hmm_peaks] Saved plot to %s", fname)
+                    plt.close(fig)
                 else:
-
+                    fig.tight_layout()
                     plt.show()
 
+                # Collect new obs columns; assign once per layer/ref
+                new_obs_cols: Dict[str, np.ndarray] = {}
                 feature_peak_cols = []
 
-
-                for center in peak_centers:
+                for center in np.asarray(peak_centers).tolist():
                     start = center - peak_width // 2
                     end = center + peak_width // 2
 
-                    #
+                    # var window mask
                     colname = f"{layer_name}_{ref}_peak_{center}"
                     feature_peak_cols.append(colname)
                     all_peak_var_cols.append(colname)
-
-                    # Var-level mask: is this position in the window?
                     peak_mask = (coordinates >= start) & (coordinates <= end)
                     adata.var[colname] = peak_mask
 
-                    #
-                    region = matrix[:, peak_mask]  # (
+                    # feature-layer summaries for reads in this ref
+                    region = matrix[:, peak_mask]  # (n_ref, n_window)
 
-                    # Per-read summary in this window for the feature layer itself
                    mean_col = f"mean_{layer_name}_{ref}_around_{center}"
                    sum_col = f"sum_{layer_name}_{ref}_around_{center}"
                    present_col = f"{layer_name}_{ref}_present_at_{center}"
 
-
-
-
-
-
-
-
-
-
-
-
-
+                    for nm, default, dt in (
+                        (mean_col, np.nan, float),
+                        (sum_col, 0.0, float),
+                        (present_col, False, bool),
+                    ):
+                        if nm not in new_obs_cols:
+                            new_obs_cols[nm] = np.full(adata.n_obs, default, dtype=dt)
+
+                    if region.shape[1] > 0:
+                        means_per_read = np.nanmean(region, axis=1)
+                        sums_per_read = np.nansum(region, axis=1)
+                    else:
+                        means_per_read = np.full(matrix.shape[0], np.nan, dtype=float)
+                        sums_per_read = np.zeros(matrix.shape[0], dtype=float)
+
+                    new_obs_cols[mean_col][ref_mask] = means_per_read
+                    new_obs_cols[sum_col][ref_mask] = sums_per_read
+                    new_obs_cols[present_col][ref_mask] = (
+                        np.nan_to_num(means_per_read, nan=0.0) > peak_threshold
                    )
 
-                    #
-
-                    sum_site_col = f"{site_type}_{ref}_sum_around_{center}"
-                    mean_site_col = f"{site_type}_{ref}_mean_around_{center}"
-                    if sum_site_col not in adata.obs:
-                        adata.obs[sum_site_col] = 0.0
-                    if mean_site_col not in adata.obs:
-                        adata.obs[mean_site_col] = np.nan
-
-                    # Per-site-type summaries for this ref
+                    # site-type summaries from adata.X, not an AnnData view
+                    Xmat = adata.X
                    for site_type in site_types:
                        mask_key = f"{ref}_{site_type}_site"
                        if mask_key not in adata.var:
@@ -299,35 +261,53 @@ def call_hmm_peaks(
                             continue
 
                         site_coords = coordinates[site_mask]
-
-
+                        site_region_mask = (site_coords >= start) & (site_coords <= end)
+                        sum_site_col = f"sum_{layer_name}_{site_type}_{ref}_around_{center}"
+                        mean_site_col = f"mean_{layer_name}_{site_type}_{ref}_around_{center}"
+
+                        if sum_site_col not in new_obs_cols:
+                            new_obs_cols[sum_site_col] = np.zeros(adata.n_obs, dtype=float)
+                        if mean_site_col not in new_obs_cols:
+                            new_obs_cols[mean_site_col] = np.full(adata.n_obs, np.nan, dtype=float)
+
+                        if not site_region_mask.any():
                             continue
 
                         full_mask = np.zeros_like(site_mask, dtype=bool)
-                        full_mask[site_mask] =
+                        full_mask[site_mask] = site_region_mask
 
-
-
-                        site_region = site_region.
-
-
-
+                        if issparse(Xmat):
+                            site_region = Xmat[ref_mask][:, full_mask]
+                            site_region = site_region.toarray()
+                        else:
+                            Xnp = np.asarray(Xmat)
+                            site_region = Xnp[np.asarray(ref_mask), :][:, np.asarray(full_mask)]
 
-
-
+                        if site_region.shape[1] > 0:
+                            new_obs_cols[sum_site_col][ref_mask] = np.nansum(site_region, axis=1)
+                            new_obs_cols[mean_site_col][ref_mask] = np.nanmean(site_region, axis=1)
 
-
-
+                # one-shot assignment to avoid fragmentation
+                if new_obs_cols:
+                    adata.obs = adata.obs.assign(
+                        **{k: pd.Series(v, index=adata.obs.index) for k, v in new_obs_cols.items()}
+                    )
 
-                #
+                # per (layer, ref) any-peak
                 any_col = f"is_in_any_{layer_name}_peak_{ref}"
-
-
-
-
+                if feature_peak_cols:
+                    adata.var[any_col] = adata.var[feature_peak_cols].any(axis=1)
+                else:
+                    adata.var[any_col] = False
+
+                logger.info(
+                    "[call_hmm_peaks] Annotated %s peaks for layer '%s' in ref '%s'.",
+                    len(peak_centers),
+                    layer_name,
+                    ref,
                )
 
-    #
+    # global any-peak across all layers/refs
     if all_peak_var_cols:
         adata.var["is_in_any_peak"] = adata.var[all_peak_var_cols].any(axis=1)
 
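The rewrite also consolidates hyperparameter handling: each feature config is read once and sanitized (min_distance, peak_width, peak_prominence, peak_threshold, rolling_window, per the hunk above). A hedged usage sketch; the feature key, reference column value, and output directory below are hypothetical:

    from smftools.hmm import call_hmm_peaks

    feature_configs = {
        # keys are matched against declared HMM layers by exact name, then suffix, then raw layer name
        "all_accessible_features": {
            "min_distance": 150,
            "peak_width": 200,
            "peak_prominence": 0.2,
            "peak_threshold": 0.8,
            "rolling_window": 5,
        },
    }
    adata = call_hmm_peaks(
        adata,
        feature_configs,
        ref_column="Reference_strand",  # must exist in adata.obs or a KeyError is raised
        site_types=("GpC", "CpG"),
        save_plot=True,
        output_dir="peak_plots",
        inplace=False,                  # operate on a copy and return it
    )
    # peak centers land in adata.uns["{layer}_{ref}_peak_centers"]; window masks land in adata.var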
smftools/hmm/display_hmm.py
CHANGED
@@ -1,18 +1,31 @@
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
 def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
+    """Log a summary of HMM transition and emission parameters.
+
+    Args:
+        hmm: HMM object with edges and distributions.
+        state_labels: Optional labels for states.
+        obs_labels: Optional labels for observations.
+    """
     import torch
-    print("\n**HMM Model Overview**")
-    print(hmm)
 
-
+    logger.info("**HMM Model Overview**")
+    logger.info("%s", hmm)
+
+    logger.info("**Transition Matrix**")
     transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
     for i, row in enumerate(transition_matrix):
         label = state_labels[i] if state_labels else f"State {i}"
         formatted_row = ", ".join(f"{p:.6f}" for p in row)
-
+        logger.info("%s: [%s]", label, formatted_row)
 
-
+    logger.info("**Emission Probabilities**")
     for i, dist in enumerate(hmm.distributions):
         label = state_labels[i] if state_labels else f"State {i}"
         probs = dist.probs.detach().cpu().numpy()
         formatted_emissions = {obs_labels[j]: probs[j] for j in range(len(probs))}
-
+        logger.info("%s: %s", label, formatted_emissions)
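Because display_hmm now routes through smftools.logging_utils instead of print, nothing is displayed unless logging is configured in the calling session. A minimal sketch, assuming get_logger wraps the standard-library logging module:

    import logging

    from smftools.hmm import display_hmm

    logging.basicConfig(level=logging.INFO)  # assumption: package loggers honor stdlib logging config
    display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"])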
smftools/hmm/hmm_readwrite.py
CHANGED
@@ -1,16 +1,25 @@
-def load_hmm(model_path, device=
+def load_hmm(model_path, device="cpu"):
     """
     Reads in a pretrained HMM.
-
+
     Parameters:
         model_path (str): Path to a pretrained HMM
     """
     import torch
+
     # Load model using PyTorch
     hmm = torch.load(model_path)
-    hmm.to(device)
+    hmm.to(device)
     return hmm
 
+
 def save_hmm(model, model_path):
+    """Save a pretrained HMM to disk.
+
+    Args:
+        model: HMM model instance.
+        model_path: Output path for the model.
+    """
     import torch
-
+
+    torch.save(model, model_path)