smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/plotting/plotting_utils.py
ADDED

@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pandas as pd
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+sns = require("seaborn", extra="plotting", purpose="plot styling")
+
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def _fixed_tick_positions(n_positions: int, n_ticks: int) -> np.ndarray:
+    """
+    Return indices for ~n_ticks evenly spaced labels across [0, n_positions-1].
+    Always includes 0 and n_positions-1 when possible.
+    """
+    n_ticks = int(max(2, n_ticks))
+    if n_positions <= n_ticks:
+        return np.arange(n_positions)
+
+    pos = np.linspace(0, n_positions - 1, n_ticks)
+    return np.unique(np.round(pos).astype(int))
+
+
+def _select_labels(
+    subset: "ad.AnnData", sites: np.ndarray, reference: str, index_col_suffix: str | None
+) -> np.ndarray:
+    """
+    Select tick labels for the heatmap axis.
+
+    Parameters
+    ----------
+    subset : AnnData view
+        The per-bin subset of the AnnData.
+    sites : np.ndarray[int]
+        Indices of the subset.var positions to annotate.
+    reference : str
+        Reference name (e.g., '6B6_top').
+    index_col_suffix : None or str
+        If None → use subset.var_names
+        Else → use subset.var[f"{reference}_{index_col_suffix}"]
+
+    Returns
+    -------
+    np.ndarray[str]
+        The labels to use for tick positions.
+    """
+    if sites.size == 0:
+        return np.array([])
+
+    if index_col_suffix is None:
+        return subset.var_names[sites].astype(str)
+
+    colname = f"{reference}_{index_col_suffix}"
+
+    if colname not in subset.var:
+        raise KeyError(
+            f"index_col_suffix='{index_col_suffix}' requires var column '{colname}', "
+            f"but it is not present in adata.var."
+        )
+
+    labels = subset.var[colname].astype(str).values
+    return labels[sites]
+
+
+def normalized_mean(matrix: np.ndarray, *, ignore_nan: bool = True) -> np.ndarray:
+    """Compute normalized column means for a matrix.
+
+    Args:
+        matrix: Input matrix.
+
+    Returns:
+        1D array of normalized means.
+    """
+    mean = np.nanmean(matrix, axis=0) if ignore_nan else np.mean(matrix, axis=0)
+    denom = (mean.max() - mean.min()) + 1e-9
+    return (mean - mean.min()) / denom
+
+
+def _layer_to_numpy(
+    subset: "ad.AnnData",
+    layer_name: str,
+    sites: np.ndarray | None = None,
+    *,
+    fill_nan_strategy: str = "value",
+    fill_nan_value: float = -1,
+) -> np.ndarray:
+    """Return a (copied) numpy array for a layer with optional NaN filling."""
+    if sites is not None:
+        layer_data = subset[:, sites].layers[layer_name]
+    else:
+        layer_data = subset.layers[layer_name]
+
+    if hasattr(layer_data, "toarray"):
+        arr = layer_data.toarray()
+    else:
+        arr = np.asarray(layer_data)
+
+    arr = np.array(arr, copy=True)
+
+    if fill_nan_strategy == "none":
+        return arr
+
+    if fill_nan_strategy not in {"value", "col_mean"}:
+        raise ValueError("fill_nan_strategy must be 'none', 'value', or 'col_mean'.")
+
+    arr = arr.astype(float, copy=False)
+
+    if fill_nan_strategy == "value":
+        return np.where(np.isnan(arr), fill_nan_value, arr)
+
+    col_mean = np.nanmean(arr, axis=0)
+    if np.any(np.isnan(col_mean)):
+        col_mean = np.where(np.isnan(col_mean), fill_nan_value, col_mean)
+    return np.where(np.isnan(arr), col_mean, arr)
+
+
+def _infer_zero_is_valid(layer_name: str | None, matrix: np.ndarray) -> bool:
+    """Infer whether zeros should count as valid (unmethylated) values."""
+    if layer_name and "nan0_0minus1" in layer_name:
+        return False
+    if np.isnan(matrix).any():
+        return True
+    if np.any(matrix < 0):
+        return False
+    return True
+
+
+def methylation_fraction(
+    matrix: np.ndarray, *, ignore_nan: bool = True, zero_is_valid: bool = False
+) -> np.ndarray:
+    """
+    Fraction methylated per column.
+    Methylated = 1
+    Valid = finite AND not 0 (unless zero_is_valid=True)
+    """
+    matrix = np.asarray(matrix)
+    if not ignore_nan:
+        matrix = np.where(np.isnan(matrix), 0, matrix)
+    finite_mask = np.isfinite(matrix)
+    valid_mask = finite_mask if zero_is_valid else (finite_mask & (matrix != 0))
+    methyl_mask = (matrix == 1) & np.isfinite(matrix)
+
+    methylated = methyl_mask.sum(axis=0)
+    valid = valid_mask.sum(axis=0)
+
+    return np.divide(
+        methylated, valid, out=np.zeros_like(methylated, dtype=float), where=valid != 0
+    )
+
+
+def _methylation_fraction_for_layer(
+    matrix: np.ndarray,
+    layer_name: str | None,
+    *,
+    ignore_nan: bool = True,
+    zero_is_valid: bool | None = None,
+) -> np.ndarray:
+    """Compute methylation fractions with layer-aware zero handling."""
+    matrix = np.asarray(matrix)
+    if zero_is_valid is None:
+        zero_is_valid = _infer_zero_is_valid(layer_name, matrix)
+    return methylation_fraction(matrix, ignore_nan=ignore_nan, zero_is_valid=zero_is_valid)
+
+
+def clean_barplot(
+    ax,
+    mean_values,
+    title,
+    *,
+    y_max: float | None = 1.0,
+    y_label: str = "Mean",
+    y_ticks: list[float] | None = None,
+):
+    """Format a barplot with consistent axes and labels.
+
+    Args:
+        ax: Matplotlib axes.
+        mean_values: Values to plot.
+        title: Plot title.
+        y_max: Optional y-axis max; inferred from data if not provided.
+        y_label: Y-axis label.
+        y_ticks: Optional y-axis ticks.
+    """
+    logger.debug("Formatting barplot '%s' with %s values.", title, len(mean_values))
+    x = np.arange(len(mean_values))
+    ax.bar(x, mean_values, color="gray", width=1.0, align="edge")
+    ax.set_xlim(0, len(mean_values))
+    if y_ticks is None and y_max == 1.0:
+        y_ticks = [0.0, 0.5, 1.0]
+    if y_max is None:
+        y_max = np.nanmax(mean_values) if len(mean_values) else 1.0
+        if not np.isfinite(y_max) or y_max <= 0:
+            y_max = 1.0
+        y_max *= 1.05
+    ax.set_ylim(0, y_max)
+    if y_ticks is not None:
+        ax.set_yticks(y_ticks)
+    ax.set_ylabel(y_label)
+    ax.set_title(title, fontsize=12, pad=2)
+
+    for spine_name, spine in ax.spines.items():
+        spine.set_visible(spine_name == "left")
+
+    ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
+
+
+def make_row_colors(meta: pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert metadata columns to RGB colors without invoking pandas Categorical.map
+    (MultiIndex-safe, category-safe).
+    """
+    row_colors = pd.DataFrame(index=meta.index)
+
+    for col in meta.columns:
+        s = meta[col].astype("object")
+
+        def _to_label(x: Any) -> str:
+            if x is None:
+                return "NA"
+            if isinstance(x, float) and np.isnan(x):
+                return "NA"
+            if isinstance(x, pd.MultiIndex):
+                return "MultiIndex"
+            if isinstance(x, tuple):
+                return "|".join(map(str, x))
+            return str(x)
+
+        labels = np.array([_to_label(x) for x in s.to_numpy()], dtype=object)
+        uniq = pd.unique(labels)
+        palette = dict(zip(uniq, sns.color_palette(n_colors=len(uniq))))
+
+        colors = [palette.get(lbl, (0.7, 0.7, 0.7)) for lbl in labels]
+        row_colors[col] = colors
+
+    return row_colors
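
A minimal usage sketch of these new helpers, assuming they are importable from smftools.plotting.plotting_utils (the module path is inferred from the file summary above and may differ in the released wheel):

import numpy as np
import pandas as pd

from smftools.plotting.plotting_utils import (  # assumed import path
    _fixed_tick_positions,
    make_row_colors,
    methylation_fraction,
    normalized_mean,
)

# Toy 4-read x 5-position matrix: 1 = methylated, 0 = unmethylated, NaN = no call.
mat = np.array(
    [
        [1.0, 0.0, np.nan, 1.0, 0.0],
        [1.0, 1.0, 0.0, np.nan, 0.0],
        [0.0, 1.0, 1.0, 1.0, np.nan],
        [np.nan, 0.0, 1.0, 1.0, 0.0],
    ]
)

frac = methylation_fraction(mat, zero_is_valid=True)  # per-column methylated / valid calls
norm = normalized_mean(mat)                           # min-max scaled column means
ticks = _fixed_tick_positions(n_positions=mat.shape[1], n_ticks=3)  # ~3 evenly spaced ticks

# Per-read metadata mapped to RGB row colors for a seaborn clustermap.
meta = pd.DataFrame({"Sample_Names": ["s1", "s1", "s2", "s2"]})
row_colors = make_row_colors(meta)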
smftools/plotting/position_stats.py
CHANGED

@@ -1,7 +1,10 @@
 from __future__ import annotations

+from smftools.logging_utils import get_logger
 from smftools.optional_imports import require

+logger = get_logger(__name__)
+

 def plot_volcano_relative_risk(
     results_dict,
@@ -29,10 +32,11 @@ def plot_volcano_relative_risk(

     plt = require("matplotlib.pyplot", extra="plotting", purpose="relative risk plots")

+    logger.info("Plotting volcano relative risk plots.")
     for ref, group_results in results_dict.items():
         for group_label, (results_df, _) in group_results.items():
             if results_df.empty:
-
+                logger.warning("Skipping empty results for %s / %s.", ref, group_label)
                 continue

             # Split by site type
@@ -100,7 +104,7 @@ def plot_volcano_relative_risk(
             )
             out_file = os.path.join(save_path, f"{safe_name}.png")
             plt.savefig(out_file, dpi=300)
-
+            logger.info("Saved volcano relative risk plot to %s.", out_file)

     plt.show()

@@ -131,10 +135,11 @@ def plot_bar_relative_risk(

     plt = require("matplotlib.pyplot", extra="plotting", purpose="relative risk plots")

+    logger.info("Plotting bar relative risk plots.")
     for ref, group_data in results_dict.items():
         for group_label, (df, _) in group_data.items():
             if df.empty:
-
+                logger.warning("Skipping empty result for %s / %s.", ref, group_label)
                 continue

             df = df.copy()
@@ -206,7 +211,7 @@ def plot_bar_relative_risk(
             )
             out_file = os.path.join(save_path, f"{safe_name}.png")
             plt.savefig(out_file, dpi=300)
-
+            logger.info("Saved bar relative risk plot to %s.", out_file)

     plt.show()

@@ -240,6 +245,8 @@ def plot_positionwise_matrix(
     plt = require("matplotlib.pyplot", extra="plotting", purpose="position stats plots")
     sns = require("seaborn", extra="plotting", purpose="position stats plots")

+    logger.info("Plotting positionwise matrices for key '%s'.", key)
+
     def find_closest_index(index, target):
         """Find the index value closest to a target value."""
         index_vals = pd.to_numeric(index, errors="coerce")
@@ -357,7 +364,7 @@ def plot_positionwise_matrix(
                     va="center",
                     fontsize=10,
                 )
-
+                logger.warning("Error plotting line for %s=%s: %s", highlight_axis, pos, e)

         line_ax.set_title(f"{highlight_axis.capitalize()} Profile(s)")
         line_ax.set_xlabel(f"{'Column' if highlight_axis == 'row' else 'Row'} position")
@@ -373,7 +380,7 @@ def plot_positionwise_matrix(
         safe_name = group.replace("=", "").replace("__", "_").replace(",", "_")
         out_file = os.path.join(save_path, f"{key}_{safe_name}.png")
         plt.savefig(out_file, dpi=300)
-
+        logger.info("Saved positionwise matrix plot to %s.", out_file)

     plt.show()

@@ -423,6 +430,7 @@ def plot_positionwise_matrix_grid(
     grid_spec = require("matplotlib.gridspec", extra="plotting", purpose="position stats plots")
     GridSpec = grid_spec.GridSpec

+    logger.info("Plotting positionwise matrix grid for key '%s'.", key)
     matrices = adata.uns[key]
     group_labels = list(matrices.keys())

@@ -515,7 +523,7 @@ def plot_positionwise_matrix_grid(
             os.makedirs(save_path, exist_ok=True)
             fname = outer_label.replace("_", "").replace("=", "") + ".png"
             plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches="tight")
-
+            logger.info("Saved positionwise matrix grid plot to %s.", fname)

         plt.close(fig)

@@ -527,4 +535,4 @@ def plot_positionwise_matrix_grid(
     for outer_label in parsed["outer"].unique():
         plot_one_grid(outer_label)

-
+    logger.info("Finished plotting all grids.")
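
These changes route progress and skip messages through smftools.logging_utils.get_logger instead of passing silently. A minimal sketch for surfacing the new messages in a script or notebook, assuming get_logger returns standard logging.Logger objects named after the importing module:

import logging

# Emit timestamps, logger names, and levels for all INFO-and-above records.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

# Narrow the output to the plotting modules shown in this diff if desired.
logging.getLogger("smftools.plotting").setLevel(logging.INFO)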
smftools/plotting/preprocess_plotting.py
ADDED

@@ -0,0 +1,281 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List, Sequence
+
+import numpy as np
+import pandas as pd
+import scipy.cluster.hierarchy as sch
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+colors = require("matplotlib.colors", extra="plotting", purpose="plot rendering")
+plt = require("matplotlib.pyplot", extra="plotting", purpose="plot rendering")
+sns = require("seaborn", extra="plotting", purpose="plot styling")
+
+logger = get_logger(__name__)
+
+
+def plot_read_span_quality_clustermaps(
+    adata,
+    sample_col: str = "Sample_Names",
+    reference_col: str = "Reference_strand",
+    quality_layer: str = "base_quality_scores",
+    read_span_layer: str = "read_span_mask",
+    quality_cmap: str = "viridis",
+    read_span_color: str = "#2ca25f",
+    max_nan_fraction: float | None = None,
+    min_quality: float | None = None,
+    min_length: int | None = None,
+    min_mapped_length_to_reference_length_ratio: float | None = None,
+    demux_types: Sequence[str] = ("single", "double", "already"),
+    max_reads: int | None = None,
+    xtick_step: int | None = None,
+    xtick_rotation: int = 90,
+    xtick_fontsize: int = 9,
+    show_position_axis: bool = False,
+    position_axis_tick_target: int = 25,
+    save_path: str | Path | None = None,
+) -> List[Dict[str, Any]]:
+    """Plot read-span mask and base quality clustermaps side by side.
+
+    Clustering is performed using the base-quality layer ordering, which is then
+    applied to the read-span mask to keep the two panels aligned.
+
+    Args:
+        adata: AnnData with read-span and base-quality layers.
+        sample_col: Column in ``adata.obs`` that identifies samples.
+        reference_col: Column in ``adata.obs`` that identifies references.
+        quality_layer: Layer name containing base-quality scores.
+        read_span_layer: Layer name containing read-span masks.
+        quality_cmap: Colormap for base-quality scores.
+        read_span_color: Color for read-span mask (1-values); 0-values are white.
+        max_nan_fraction: Optional maximum fraction of NaNs allowed per position; positions
+            above this threshold are excluded.
+        min_quality: Optional minimum read quality filter.
+        min_length: Optional minimum mapped length filter.
+        min_mapped_length_to_reference_length_ratio: Optional min length ratio filter.
+        demux_types: Allowed ``demux_type`` values, if present in ``adata.obs``.
+        max_reads: Optional maximum number of reads to plot per sample/reference.
+        xtick_step: Spacing between x-axis tick labels (None = no labels).
+        xtick_rotation: Rotation for x-axis tick labels.
+        xtick_fontsize: Font size for x-axis tick labels.
+        show_position_axis: Whether to draw a position axis with tick labels.
+        position_axis_tick_target: Approximate number of ticks to show when auto-sizing.
+        save_path: Optional output directory for saving plots.
+
+    Returns:
+        List of dictionaries with per-plot metadata and output paths.
+    """
+    logger.info("Plotting read span and quality clustermaps.")
+
+    def _mask_or_true(series_name: str, predicate):
+        if series_name not in adata.obs:
+            return pd.Series(True, index=adata.obs.index)
+        s = adata.obs[series_name]
+        try:
+            return predicate(s)
+        except Exception:
+            return pd.Series(True, index=s.index)
+
+    def _resolve_xtick_step(n_positions: int) -> int | None:
+        if xtick_step is not None:
+            return xtick_step
+        if not show_position_axis:
+            return None
+        return max(1, int(np.ceil(n_positions / position_axis_tick_target)))
+
+    def _fill_nan_with_col_means(matrix: np.ndarray) -> np.ndarray:
+        filled = matrix.copy()
+        col_means = np.nanmean(filled, axis=0)
+        col_means = np.where(np.isnan(col_means), 0.0, col_means)
+        nan_rows, nan_cols = np.where(np.isnan(filled))
+        filled[nan_rows, nan_cols] = col_means[nan_cols]
+        return filled
+
+    if quality_layer not in adata.layers:
+        raise KeyError(f"Layer '{quality_layer}' not found in adata.layers")
+    if read_span_layer not in adata.layers:
+        raise KeyError(f"Layer '{read_span_layer}' not found in adata.layers")
+    if max_nan_fraction is not None and not (0 <= max_nan_fraction <= 1):
+        raise ValueError("max_nan_fraction must be between 0 and 1.")
+    if position_axis_tick_target < 1:
+        raise ValueError("position_axis_tick_target must be at least 1.")
+
+    results: List[Dict[str, Any]] = []
+    save_path = Path(save_path) if save_path is not None else None
+    if save_path is not None:
+        save_path.mkdir(parents=True, exist_ok=True)
+
+    for col in (sample_col, reference_col):
+        if col not in adata.obs:
+            raise KeyError(f"{col} not in adata.obs")
+        if not isinstance(adata.obs[col].dtype, pd.CategoricalDtype):
+            adata.obs[col] = adata.obs[col].astype("category")
+
+    for ref in adata.obs[reference_col].cat.categories:
+        for sample in adata.obs[sample_col].cat.categories:
+            qmask = _mask_or_true(
+                "read_quality",
+                (lambda s: s >= float(min_quality))
+                if (min_quality is not None)
+                else (lambda s: pd.Series(True, index=s.index)),
+            )
+            lm_mask = _mask_or_true(
+                "mapped_length",
+                (lambda s: s >= float(min_length))
+                if (min_length is not None)
+                else (lambda s: pd.Series(True, index=s.index)),
+            )
+            lrr_mask = _mask_or_true(
+                "mapped_length_to_reference_length_ratio",
+                (lambda s: s >= float(min_mapped_length_to_reference_length_ratio))
+                if (min_mapped_length_to_reference_length_ratio is not None)
+                else (lambda s: pd.Series(True, index=s.index)),
+            )
+            demux_mask = _mask_or_true(
+                "demux_type",
+                (lambda s: s.astype("string").isin(list(demux_types)))
+                if (demux_types is not None)
+                else (lambda s: pd.Series(True, index=s.index)),
+            )
+
+            row_mask = (
+                (adata.obs[reference_col] == ref)
+                & (adata.obs[sample_col] == sample)
+                & qmask
+                & lm_mask
+                & lrr_mask
+                & demux_mask
+            )
+            if not bool(row_mask.any()):
+                continue
+
+            subset = adata[row_mask, :].copy()
+            quality_matrix = np.asarray(subset.layers[quality_layer]).astype(float)
+            quality_matrix[quality_matrix < 0] = np.nan
+            read_span_matrix = np.asarray(subset.layers[read_span_layer]).astype(float)
+
+            if max_nan_fraction is not None:
+                nan_mask = np.isnan(quality_matrix) | np.isnan(read_span_matrix)
+                nan_fraction = nan_mask.mean(axis=0)
+                keep_columns = nan_fraction <= max_nan_fraction
+                if not np.any(keep_columns):
+                    continue
+                quality_matrix = quality_matrix[:, keep_columns]
+                read_span_matrix = read_span_matrix[:, keep_columns]
+                subset = subset[:, keep_columns].copy()
+
+            if max_reads is not None and quality_matrix.shape[0] > max_reads:
+                quality_matrix = quality_matrix[:max_reads]
+                read_span_matrix = read_span_matrix[:max_reads]
+                subset = subset[:max_reads, :].copy()
+
+            if quality_matrix.size == 0:
+                continue
+
+            quality_filled = _fill_nan_with_col_means(quality_matrix)
+            linkage = sch.linkage(quality_filled, method="ward")
+            order = sch.leaves_list(linkage)
+
+            quality_matrix = quality_matrix[order]
+            read_span_matrix = read_span_matrix[order]
+
+            fig, axes = plt.subplots(
+                nrows=2,
+                ncols=3,
+                figsize=(18, 6),
+                sharex="col",
+                gridspec_kw={"height_ratios": [1, 4], "width_ratios": [1, 1, 0.05]},
+            )
+            span_bar_ax, quality_bar_ax, bar_spacer_ax = axes[0]
+            span_ax, quality_ax, cbar_ax = axes[1]
+            bar_spacer_ax.set_axis_off()
+
+            span_mean = np.nanmean(read_span_matrix, axis=0)
+            quality_mean = np.nanmean(quality_matrix, axis=0)
+            bar_positions = np.arange(read_span_matrix.shape[1]) + 0.5
+            span_bar_ax.bar(
+                bar_positions,
+                span_mean,
+                color=read_span_color,
+                width=1.0,
+            )
+            span_bar_ax.set_title(f"{read_span_layer} mean")
+            span_bar_ax.set_xlim(0, read_span_matrix.shape[1])
+            span_bar_ax.tick_params(axis="x", labelbottom=False)
+
+            quality_bar_ax.bar(
+                bar_positions,
+                quality_mean,
+                color="#4c72b0",
+                width=1.0,
+            )
+            quality_bar_ax.set_title(f"{quality_layer} mean")
+            quality_bar_ax.set_xlim(0, quality_matrix.shape[1])
+            quality_bar_ax.tick_params(axis="x", labelbottom=False)
+
+            span_cmap = colors.ListedColormap(["white", read_span_color])
+            span_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], span_cmap.N)
+            sns.heatmap(
+                read_span_matrix,
+                cmap=span_cmap,
+                norm=span_norm,
+                ax=span_ax,
+                yticklabels=False,
+                cbar=False,
+            )
+            span_ax.set_title(read_span_layer)
+
+            sns.heatmap(
+                quality_matrix,
+                cmap=quality_cmap,
+                ax=quality_ax,
+                yticklabels=False,
+                cbar=True,
+                cbar_ax=cbar_ax,
+            )
+            quality_ax.set_title(quality_layer)
+
+            resolved_step = _resolve_xtick_step(quality_matrix.shape[1])
+            for axis in (span_ax, quality_ax):
+                if resolved_step is not None and resolved_step > 0:
+                    sites = np.arange(0, quality_matrix.shape[1], resolved_step)
+                    axis.set_xticks(sites)
+                    axis.set_xticklabels(
+                        subset.var_names[sites].astype(str),
+                        rotation=xtick_rotation,
+                        fontsize=xtick_fontsize,
+                    )
+                else:
+                    axis.set_xticks([])
+                if show_position_axis or xtick_step is not None:
+                    axis.set_xlabel("Position")
+
+            n_reads = quality_matrix.shape[0]
+            fig.suptitle(f"{sample} - {ref} - {n_reads} reads")
+            fig.tight_layout(rect=(0, 0, 1, 0.95))
+
+            out_file = None
+            if save_path is not None:
+                safe_name = f"{ref}__{sample}__read_span_quality".replace("=", "").replace(",", "_")
+                out_file = save_path / f"{safe_name}.png"
+                fig.savefig(out_file, dpi=300, bbox_inches="tight")
+                plt.close(fig)
+                logger.info("Saved read span/quality clustermap to %s.", out_file)
+            else:
+                plt.show()
+
+            results.append(
+                {
+                    "reference": str(ref),
+                    "sample": str(sample),
+                    "quality_layer": quality_layer,
+                    "read_span_layer": read_span_layer,
+                    "n_positions": int(quality_matrix.shape[1]),
+                    "output_path": str(out_file) if out_file is not None else None,
+                }
+            )
+
+    return results
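
A hypothetical call to the new plot_read_span_quality_clustermaps, importing it directly from the module added above (smftools.plotting may also re-export it, but that is not shown in this diff); the .h5ad path is a placeholder:

import anndata as ad

from smftools.plotting.preprocess_plotting import plot_read_span_quality_clustermaps

adata = ad.read_h5ad("experiment.h5ad")  # placeholder input file

results = plot_read_span_quality_clustermaps(
    adata,
    sample_col="Sample_Names",
    reference_col="Reference_strand",
    quality_layer="base_quality_scores",
    read_span_layer="read_span_mask",
    min_quality=10,          # keep reads with read_quality >= 10
    max_nan_fraction=0.5,    # drop positions with more than 50% missing values
    max_reads=2000,          # cap reads per sample/reference panel
    show_position_axis=True,
    save_path="qc_plots/read_span_quality",
)
for entry in results:
    print(entry["sample"], entry["reference"], entry["output_path"])
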
smftools/plotting/qc_plotting.py
CHANGED
@@ -5,10 +5,13 @@ import os
 import numpy as np
 import pandas as pd

+from smftools.logging_utils import get_logger
 from smftools.optional_imports import require

 plt = require("matplotlib.pyplot", extra="plotting", purpose="QC plots")

+logger = get_logger(__name__)
+

 def plot_read_qc_histograms(
     adata,
@@ -53,6 +56,7 @@ def plot_read_qc_histograms(
    dpi : int
        Figure resolution.
    """
+    logger.info("Plotting read QC histograms to %s.", outdir)
    os.makedirs(outdir, exist_ok=True)

    if sample_key not in adata.obs.columns:
@@ -60,7 +64,7 @@ def plot_read_qc_histograms(

    # Ensure sample_key is categorical for stable ordering
    samples = adata.obs[sample_key]
-    if not pd.
+    if not isinstance(samples.dtype, pd.CategoricalDtype):
        samples = samples.astype("category")
    sample_levels = list(samples.cat.categories)

@@ -69,14 +73,14 @@ def plot_read_qc_histograms(
    is_numeric = {}
    for key in obs_keys:
        if key not in adata.obs.columns:
-
+            logger.warning("'%s' not found in obs; skipping.", key)
            continue
        s = adata.obs[key]
        num = pd.api.types.is_numeric_dtype(s)
        valid_keys.append(key)
        is_numeric[key] = num
    if not valid_keys:
-
+        logger.warning("No valid obs_keys to plot.")
        return

    # Precompute global numeric ranges (after clipping) so rows share x-axis per column
@@ -174,6 +178,7 @@ def plot_read_qc_histograms(
        page = start // rows_per_fig + 1
        out_png = os.path.join(outdir, f"qc_grid_{_sanitize(sample_key)}_page{page}.png")
        plt.savefig(out_png, bbox_inches="tight")
+        logger.info("Saved QC histogram page to %s.", out_png)
        plt.close(fig)

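
The completed dtype check above uses isinstance with pd.CategoricalDtype, the same pattern applied in preprocess_plotting.py; a standalone illustration of that pattern (independent of smftools):

import pandas as pd

# Coerce a plain object Series to categorical only when it is not one already.
samples = pd.Series(["s1", "s2", "s1"])
if not isinstance(samples.dtype, pd.CategoricalDtype):
    samples = samples.astype("category")

print(list(samples.cat.categories))  # ['s1', 's2']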