smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,52 +1,76 @@
1
+ from __future__ import annotations
2
+
1
3
  # duplicate_detection_with_hier_and_plots.py
2
4
  import copy
3
- import warnings
4
5
  import math
5
6
  import os
7
+ import warnings
6
8
  from collections import defaultdict
7
- from typing import Dict, Any, Tuple, Union, List, Optional
9
+ from importlib.util import find_spec
10
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
8
11
 
9
- import torch
10
- import anndata as ad
11
12
  import numpy as np
12
13
  import pandas as pd
13
- import matplotlib.pyplot as plt
14
- from tqdm import tqdm
14
+ from scipy.cluster import hierarchy as sch
15
+ from scipy.spatial.distance import pdist, squareform
16
+ from scipy.stats import gaussian_kde
17
+
18
+ from smftools.logging_utils import get_logger
19
+ from smftools.optional_imports import require
15
20
 
16
21
  from ..readwrite import make_dirs
17
22
 
18
- # optional imports for clustering / PCA / KDE
19
- try:
20
- from scipy.cluster import hierarchy as sch
21
- from scipy.spatial.distance import pdist, squareform
22
- SCIPY_AVAILABLE = True
23
- except Exception:
24
- sch = None
25
- pdist = None
26
- squareform = None
27
- SCIPY_AVAILABLE = False
28
-
29
- try:
30
- from sklearn.decomposition import PCA
31
- from sklearn.cluster import KMeans, DBSCAN
32
- from sklearn.mixture import GaussianMixture
33
- from sklearn.metrics import silhouette_score
34
- SKLEARN_AVAILABLE = True
35
- except Exception:
36
- PCA = None
37
- KMeans = DBSCAN = GaussianMixture = silhouette_score = None
38
- SKLEARN_AVAILABLE = False
39
-
40
- try:
41
- from scipy.stats import gaussian_kde
42
- except Exception:
43
- gaussian_kde = None
44
-
45
-
46
- def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer="orig") -> dict:
47
- """
48
- Merge two .uns dicts. prefer='orig' will keep orig_uns values on conflict,
49
- prefer='new' will keep new_uns values on conflict. Conflicts are reported.
23
+ logger = get_logger(__name__)
24
+
25
+ plt = require("matplotlib.pyplot", extra="plotting", purpose="duplicate read plots")
26
+ torch = require("torch", extra="torch", purpose="duplicate read detection")
27
+
28
+ if TYPE_CHECKING:
29
+ import anndata as ad
30
+
31
+ SCIPY_AVAILABLE = True
32
+ SKLEARN_AVAILABLE = find_spec("sklearn") is not None
33
+
34
+ PCA = None
35
+ KMeans = DBSCAN = GaussianMixture = silhouette_score = None
36
+ if SKLEARN_AVAILABLE:
37
+ sklearn_cluster = require(
38
+ "sklearn.cluster",
39
+ extra="ml-base",
40
+ purpose="duplicate read clustering",
41
+ )
42
+ sklearn_decomp = require(
43
+ "sklearn.decomposition",
44
+ extra="ml-base",
45
+ purpose="duplicate read PCA",
46
+ )
47
+ sklearn_metrics = require(
48
+ "sklearn.metrics",
49
+ extra="ml-base",
50
+ purpose="duplicate read clustering diagnostics",
51
+ )
52
+ sklearn_mixture = require(
53
+ "sklearn.mixture",
54
+ extra="ml-base",
55
+ purpose="duplicate read clustering",
56
+ )
57
+ DBSCAN = sklearn_cluster.DBSCAN
58
+ KMeans = sklearn_cluster.KMeans
59
+ PCA = sklearn_decomp.PCA
60
+ silhouette_score = sklearn_metrics.silhouette_score
61
+ GaussianMixture = sklearn_mixture.GaussianMixture
62
+
63
+
64
+ def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer: str = "orig") -> dict:
65
+ """Merge two ``.uns`` dictionaries while preserving preferred values.
66
+
67
+ Args:
68
+ orig_uns: Original ``.uns`` dictionary.
69
+ new_uns: New ``.uns`` dictionary to merge.
70
+ prefer: Which dictionary to prefer on conflict (``"orig"`` or ``"new"``).
71
+
72
+ Returns:
73
+ dict: Merged dictionary.
50
74
  """
51
75
  out = copy.deepcopy(new_uns) if new_uns is not None else {}
52
76
  for k, v in (orig_uns or {}).items():
@@ -55,7 +79,7 @@ def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer="orig") -> dict:
55
79
  else:
56
80
  # present in both: compare quickly (best-effort)
57
81
  try:
58
- equal = (out[k] == v)
82
+ equal = out[k] == v
59
83
  except Exception:
60
84
  equal = False
61
85
  if equal:
@@ -69,9 +93,10 @@ def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer="orig") -> dict:
69
93
  out[f"orig_uns__{k}"] = copy.deepcopy(v)
70
94
  return out
71
95
 
96
+
72
97
  def flag_duplicate_reads(
73
- adata,
74
- var_filters_sets,
98
+ adata: ad.AnnData,
99
+ var_filters_sets: Sequence[dict[str, Any]],
75
100
  distance_threshold: float = 0.07,
76
101
  obs_reference_col: str = "Reference_strand",
77
102
  sample_col: str = "Barcode",
@@ -81,7 +106,7 @@ def flag_duplicate_reads(
81
106
  uns_filtered_flag: str = "read_duplicates_removed",
82
107
  bypass: bool = False,
83
108
  force_redo: bool = False,
84
- keep_best_metric: Optional[str] = 'read_quality',
109
+ keep_best_metric: Optional[str] = "read_quality",
85
110
  keep_best_higher: bool = True,
86
111
  window_size: int = 50,
87
112
  min_overlap_positions: int = 20,
@@ -93,19 +118,119 @@ def flag_duplicate_reads(
93
118
  hierarchical_metric: str = "euclidean",
94
119
  hierarchical_window: int = 50,
95
120
  random_state: int = 0,
96
- ):
121
+ demux_types: Optional[Sequence[str]] = None,
122
+ demux_col: str = "demux_type",
123
+ ) -> ad.AnnData:
124
+ """Flag duplicate reads with demux-aware keeper preference.
125
+
126
+ Behavior:
127
+ - All reads are processed (no masking by demux).
128
+ - At each keeper decision, prefer reads whose ``demux_col`` value is in
129
+ ``demux_types`` when present. Among candidates, choose by
130
+ ``keep_best_metric``.
131
+
132
+ Args:
133
+ adata: AnnData object to process.
134
+ var_filters_sets: Sequence of variable filter definitions.
135
+ distance_threshold: Distance threshold for duplicate detection.
136
+ obs_reference_col: Obs column containing reference identifiers.
137
+ sample_col: Obs column containing sample identifiers.
138
+ output_directory: Directory for output plots and artifacts.
139
+ metric_keys: Metric key(s) used in processing.
140
+ uns_flag: Flag in ``adata.uns`` indicating prior completion.
141
+ uns_filtered_flag: Flag to mark read duplicates removal.
142
+ bypass: Whether to skip processing.
143
+ force_redo: Whether to rerun even if ``uns_flag`` is set.
144
+ keep_best_metric: Obs column used to select best read within duplicates.
145
+ keep_best_higher: Whether higher values in ``keep_best_metric`` are preferred.
146
+ window_size: Window size for local comparisons.
147
+ min_overlap_positions: Minimum overlapping positions required.
148
+ do_pca: Whether to run PCA before clustering.
149
+ pca_n_components: Number of PCA components.
150
+ pca_center: Whether to center data before PCA.
151
+ do_hierarchical: Whether to run hierarchical clustering.
152
+ hierarchical_linkage: Linkage method for hierarchical clustering.
153
+ hierarchical_metric: Distance metric for hierarchical clustering.
154
+ hierarchical_window: Window size for hierarchical clustering.
155
+ random_state: Random seed.
156
+ demux_types: Preferred demux types for keeper selection.
157
+ demux_col: Obs column containing demux type labels.
158
+
159
+ Returns:
160
+ anndata.AnnData: AnnData object with duplicate flags stored in ``adata.obs``.
97
161
  """
98
- Duplicate-flagging pipeline where hierarchical stage operates only on representatives
99
- (one representative per lex cluster, i.e. the keeper). Final keeper assignment and
100
- enforcement happens only after hierarchical merging.
162
+ import copy
163
+ import warnings
164
+
165
+ import anndata as ad
166
+ import numpy as np
167
+ import pandas as pd
168
+
169
+ # -------- helper: demux-aware keeper selection --------
170
+ def _choose_keeper_with_demux_preference(
171
+ members_idx: List[int],
172
+ adata_subset: ad.AnnData,
173
+ obs_index_list: List[Any],
174
+ *,
175
+ demux_col: str = "demux_type",
176
+ preferred_types: Optional[Sequence[str]] = None,
177
+ keep_best_metric: Optional[str] = None,
178
+ keep_best_higher: bool = True,
179
+ lex_keeper_mask: Optional[np.ndarray] = None, # aligned to members order
180
+ ) -> int:
181
+ """
182
+ Prefer members whose demux_col ∈ preferred_types.
183
+ Among candidates, pick by keep_best_metric (higher/lower).
184
+ If metric missing/NaN, prefer lex keeper (via mask) among candidates.
185
+ Fallback: first candidate.
186
+ Returns the chosen *member index* (int from members_idx).
187
+ """
188
+ # 1) demux-preferred candidates
189
+ if preferred_types and (demux_col in adata_subset.obs.columns):
190
+ preferred = set(map(str, preferred_types))
191
+ demux_series = adata_subset.obs[demux_col].astype("string")
192
+ names = [obs_index_list[m] for m in members_idx]
193
+ is_pref = demux_series.loc[names].isin(preferred).to_numpy()
194
+ candidates = [members_idx[i] for i, ok in enumerate(is_pref) if ok]
195
+ else:
196
+ candidates = []
101
197
 
102
- Returns (adata_unique, adata_full) as before; writes sequence__* columns into adata.obs.
103
- """
104
- # early exits
198
+ if not candidates:
199
+ candidates = list(members_idx)
200
+
201
+ # 2) metric-based within candidates
202
+ if keep_best_metric and (keep_best_metric in adata_subset.obs.columns):
203
+ cand_names = [obs_index_list[m] for m in candidates]
204
+ try:
205
+ vals = pd.to_numeric(
206
+ adata_subset.obs.loc[cand_names, keep_best_metric],
207
+ errors="coerce",
208
+ ).to_numpy(dtype=float)
209
+ except Exception:
210
+ vals = np.array([np.nan] * len(candidates), dtype=float)
211
+
212
+ if not np.all(np.isnan(vals)):
213
+ if keep_best_higher:
214
+ vals = np.where(np.isnan(vals), -np.inf, vals)
215
+ return candidates[int(np.nanargmax(vals))]
216
+ else:
217
+ vals = np.where(np.isnan(vals), np.inf, vals)
218
+ return candidates[int(np.nanargmin(vals))]
219
+
220
+ # 3) metric unhelpful — prefer lex keeper if provided
221
+ if lex_keeper_mask is not None:
222
+ for i, midx in enumerate(members_idx):
223
+ if (midx in candidates) and bool(lex_keeper_mask[i]):
224
+ return midx
225
+
226
+ # 4) fallback
227
+ return candidates[0]
228
+
229
+ # -------- early exits --------
105
230
  already = bool(adata.uns.get(uns_flag, False))
106
- if (already and not force_redo):
231
+ if already and not force_redo:
107
232
  if "is_duplicate" in adata.obs.columns:
108
- adata_unique = adata[adata.obs["is_duplicate"] == False].copy()
233
+ adata_unique = adata[~adata.obs["is_duplicate"]].copy()
109
234
  return adata_unique, adata
110
235
  else:
111
236
  return adata.copy(), adata.copy()
@@ -117,17 +242,23 @@ def flag_duplicate_reads(
117
242
 
118
243
  # local UnionFind
119
244
  class UnionFind:
245
+ """Disjoint-set union-find helper for clustering indices."""
246
+
120
247
  def __init__(self, size):
248
+ """Initialize parent pointers for the union-find."""
121
249
  self.parent = list(range(size))
122
250
 
123
251
  def find(self, x):
252
+ """Find the root for a member with path compression."""
124
253
  while self.parent[x] != x:
125
254
  self.parent[x] = self.parent[self.parent[x]]
126
255
  x = self.parent[x]
127
256
  return x
128
257
 
129
258
  def union(self, x, y):
130
- rx = self.find(x); ry = self.find(y)
259
+ """Union the sets that contain x and y."""
260
+ rx = self.find(x)
261
+ ry = self.find(y)
131
262
  if rx != ry:
132
263
  self.parent[ry] = rx
133
264
 
@@ -139,14 +270,14 @@ def flag_duplicate_reads(
139
270
 
140
271
  for sample in samples:
141
272
  for ref in references:
142
- print(f"Processing sample={sample} ref={ref}")
273
+ logger.info("Processing sample=%s ref=%s", sample, ref)
143
274
  sample_mask = adata.obs[sample_col] == sample
144
275
  ref_mask = adata.obs[obs_reference_col] == ref
145
276
  subset_mask = sample_mask & ref_mask
146
277
  adata_subset = adata[subset_mask].copy()
147
278
 
148
279
  if adata_subset.n_obs < 2:
149
- print(f" Skipping {sample}_{ref} (too few reads)")
280
+ logger.info(" Skipping %s_%s (too few reads)", sample, ref)
150
281
  continue
151
282
 
152
283
  N = adata_subset.shape[0]
@@ -162,7 +293,12 @@ def flag_duplicate_reads(
162
293
 
163
294
  selected_cols = adata.var.index[combined_mask.tolist()].to_list()
164
295
  col_indices = [adata.var.index.get_loc(c) for c in selected_cols]
165
- print(f" Selected {len(col_indices)} columns out of {adata.var.shape[0]} for {ref}")
296
+ logger.info(
297
+ " Selected %s columns out of %s for %s",
298
+ len(col_indices),
299
+ adata.var.shape[0],
300
+ ref,
301
+ )
166
302
 
167
303
  # Extract data matrix (dense numpy) for the subset
168
304
  X = adata_subset.X
@@ -187,7 +323,10 @@ def flag_duplicate_reads(
187
323
  hierarchical_found_dists = []
188
324
 
189
325
  # Lexicographic windowed pass function
190
- def cluster_pass(X_tensor_local, reverse=False, window=int(window_size), record_distances=False):
326
+ def cluster_pass(
327
+ X_tensor_local, reverse=False, window=int(window_size), record_distances=False
328
+ ):
329
+ """Perform a lexicographic windowed clustering pass."""
191
330
  N_local = X_tensor_local.shape[0]
192
331
  X_sortable = X_tensor_local.clone().nan_to_num(-1.0)
193
332
  sort_keys = [tuple(row.numpy().tolist()) for row in X_sortable]
@@ -208,10 +347,16 @@ def flag_duplicate_reads(
208
347
  if enough_overlap.any():
209
348
  diffs = (row_i_exp != block_rows) & valid_mask
210
349
  hamming_counts = diffs.sum(dim=1).float()
211
- hamming_dists = torch.where(valid_counts > 0, hamming_counts / valid_counts, torch.tensor(float("nan")))
350
+ hamming_dists = torch.where(
351
+ valid_counts > 0,
352
+ hamming_counts / valid_counts,
353
+ torch.tensor(float("nan")),
354
+ )
212
355
  # record distances (legacy list of all local comparisons)
213
356
  hamming_np = hamming_dists.cpu().numpy().tolist()
214
- local_hamming_dists.extend([float(x) for x in hamming_np if (not np.isnan(x))])
357
+ local_hamming_dists.extend(
358
+ [float(x) for x in hamming_np if (not np.isnan(x))]
359
+ )
215
360
  matches = (hamming_dists < distance_threshold) & (enough_overlap)
216
361
  for offset_local, m in enumerate(matches):
217
362
  if m:
@@ -223,20 +368,28 @@ def flag_duplicate_reads(
223
368
  next_local_idx = i + 1
224
369
  if next_local_idx < len(sorted_X):
225
370
  next_global = sorted_idx[next_local_idx]
226
- vm_pair = (~torch.isnan(row_i)) & (~torch.isnan(sorted_X[next_local_idx]))
371
+ vm_pair = (~torch.isnan(row_i)) & (
372
+ ~torch.isnan(sorted_X[next_local_idx])
373
+ )
227
374
  vc = vm_pair.sum().item()
228
375
  if vc >= min_overlap_positions:
229
- d = float(((row_i[vm_pair] != sorted_X[next_local_idx][vm_pair]).sum().item()) / vc)
376
+ d = float(
377
+ (
378
+ (row_i[vm_pair] != sorted_X[next_local_idx][vm_pair])
379
+ .sum()
380
+ .item()
381
+ )
382
+ / vc
383
+ )
230
384
  if reverse:
231
385
  rev_hamming_to_prev[next_global] = d
232
386
  else:
233
387
  fwd_hamming_to_next[sorted_idx[i]] = d
234
388
  return cluster_pairs_local
235
389
 
236
- # run forward pass
390
+ # run forward & reverse windows
237
391
  pairs_fwd = cluster_pass(X_tensor, reverse=False, record_distances=True)
238
392
  involved_in_fwd = set([p for pair in pairs_fwd for p in pair])
239
- # build mask for reverse pass to avoid re-checking items already paired
240
393
  mask_for_rev = np.ones(N, dtype=bool)
241
394
  if len(involved_in_fwd) > 0:
242
395
  for idx in involved_in_fwd:
@@ -245,8 +398,9 @@ def flag_duplicate_reads(
245
398
  if len(rev_idx_map) > 0:
246
399
  reduced_tensor = X_tensor[rev_idx_map]
247
400
  pairs_rev_local = cluster_pass(reduced_tensor, reverse=True, record_distances=True)
248
- # remap local reduced indices to global
249
- remapped_rev_pairs = [(int(rev_idx_map[i]), int(rev_idx_map[j])) for (i, j) in pairs_rev_local]
401
+ remapped_rev_pairs = [
402
+ (int(rev_idx_map[i]), int(rev_idx_map[j])) for (i, j) in pairs_rev_local
403
+ ]
250
404
  else:
251
405
  remapped_rev_pairs = []
252
406
 
@@ -265,53 +419,41 @@ def flag_duplicate_reads(
265
419
  id_map = {old: new for new, old in enumerate(sorted(unique_initial.tolist()))}
266
420
  merged_cluster_mapped = np.array([id_map[int(x)] for x in merged_cluster], dtype=int)
267
421
 
268
- # cluster sizes and choose lex-keeper per lex-cluster (representatives)
422
+ # cluster sizes and choose lex-keeper per lex-cluster (demux-aware)
269
423
  cluster_sizes = np.zeros_like(merged_cluster_mapped)
270
424
  cluster_counts = []
271
425
  unique_clusters = np.unique(merged_cluster_mapped)
272
426
  keeper_for_cluster = {}
427
+
428
+ obs_index = list(adata_subset.obs.index)
273
429
  for cid in unique_clusters:
274
430
  members = np.where(merged_cluster_mapped == cid)[0].tolist()
275
431
  csize = int(len(members))
276
432
  cluster_counts.append(csize)
277
433
  cluster_sizes[members] = csize
278
- # pick lex keeper (representative)
279
- if len(members) == 1:
280
- keeper_for_cluster[cid] = members[0]
281
- else:
282
- if keep_best_metric is None:
283
- keeper_for_cluster[cid] = members[0]
284
- else:
285
- obs_index = list(adata_subset.obs.index)
286
- member_names = [obs_index[m] for m in members]
287
- try:
288
- vals = pd.to_numeric(adata_subset.obs.loc[member_names, keep_best_metric], errors="coerce").to_numpy(dtype=float)
289
- except Exception:
290
- vals = np.array([np.nan] * len(members), dtype=float)
291
- if np.all(np.isnan(vals)):
292
- keeper_for_cluster[cid] = members[0]
293
- else:
294
- if keep_best_higher:
295
- nan_mask = np.isnan(vals)
296
- vals[nan_mask] = -np.inf
297
- rel_idx = int(np.nanargmax(vals))
298
- else:
299
- nan_mask = np.isnan(vals)
300
- vals[nan_mask] = np.inf
301
- rel_idx = int(np.nanargmin(vals))
302
- keeper_for_cluster[cid] = members[rel_idx]
303
434
 
304
- # expose lex keeper info (record only; do not enforce deletion yet)
435
+ keeper_for_cluster[cid] = _choose_keeper_with_demux_preference(
436
+ members,
437
+ adata_subset,
438
+ obs_index,
439
+ demux_col=demux_col,
440
+ preferred_types=demux_types,
441
+ keep_best_metric=keep_best_metric,
442
+ keep_best_higher=keep_best_higher,
443
+ lex_keeper_mask=None, # no lex preference yet
444
+ )
445
+
446
+ # expose lex keeper info (record only)
305
447
  lex_is_keeper = np.zeros((N,), dtype=bool)
306
448
  lex_is_duplicate = np.zeros((N,), dtype=bool)
307
- for cid, members in zip(unique_clusters, [np.where(merged_cluster_mapped == cid)[0].tolist() for cid in unique_clusters]):
449
+ for cid in unique_clusters:
450
+ members = np.where(merged_cluster_mapped == cid)[0].tolist()
308
451
  keeper_idx = keeper_for_cluster[cid]
309
452
  lex_is_keeper[keeper_idx] = True
310
453
  for m in members:
311
454
  if m != keeper_idx:
312
455
  lex_is_duplicate[m] = True
313
- # note: these are just recorded for inspection / later preference
314
- # and will be written to adata_subset.obs below
456
+
315
457
  # record lex min pair (min of fwd/rev neighbor) for each read
316
458
  min_pair = np.full((N,), np.nan, dtype=float)
317
459
  for i in range(N):
@@ -349,7 +491,12 @@ def flag_duplicate_reads(
349
491
  if n_comp <= 0:
350
492
  reps_for_clustering = reps_arr
351
493
  else:
352
- pca = PCA(n_components=n_comp, random_state=int(random_state), svd_solver="auto", copy=True)
494
+ pca = PCA(
495
+ n_components=n_comp,
496
+ random_state=int(random_state),
497
+ svd_solver="auto",
498
+ copy=True,
499
+ )
353
500
  reps_for_clustering = pca.fit_transform(reps_arr)
354
501
  else:
355
502
  reps_for_clustering = reps_arr
@@ -360,10 +507,12 @@ def flag_duplicate_reads(
360
507
  Z = sch.linkage(pdist_vec, method=hierarchical_linkage)
361
508
  leaves = sch.leaves_list(Z)
362
509
  except Exception as e:
363
- warnings.warn(f"hierarchical pass failed: {e}; skipping hierarchical stage.")
510
+ warnings.warn(
511
+ f"hierarchical pass failed: {e}; skipping hierarchical stage."
512
+ )
364
513
  leaves = np.arange(len(rep_global_indices), dtype=int)
365
514
 
366
- # apply windowed hamming comparisons across ordered reps and union via same UF (so clusters of all reads merge)
515
+ # windowed hamming comparisons across ordered reps and union
367
516
  order_global_reps = [rep_global_indices[i] for i in leaves]
368
517
  n_reps = len(order_global_reps)
369
518
  for pos in range(n_reps):
@@ -389,55 +538,40 @@ def flag_duplicate_reads(
389
538
  merged_cluster_after[i] = uf.find(i)
390
539
  unique_final = np.unique(merged_cluster_after)
391
540
  id_map_final = {old: new for new, old in enumerate(sorted(unique_final.tolist()))}
392
- merged_cluster_mapped_final = np.array([id_map_final[int(x)] for x in merged_cluster_after], dtype=int)
541
+ merged_cluster_mapped_final = np.array(
542
+ [id_map_final[int(x)] for x in merged_cluster_after], dtype=int
543
+ )
393
544
 
394
- # compute final cluster members and choose final keeper per final cluster
545
+ # compute final cluster members and choose final keeper per final cluster (demux-aware)
395
546
  cluster_sizes_final = np.zeros_like(merged_cluster_mapped_final)
396
- final_cluster_counts = []
397
- final_unique = np.unique(merged_cluster_mapped_final)
398
547
  final_keeper_for_cluster = {}
399
548
  cluster_members_map = {}
400
- for cid in final_unique:
549
+
550
+ obs_index = list(adata_subset.obs.index)
551
+ lex_mask_full = lex_is_keeper # use lex keeper as optional tiebreaker
552
+
553
+ for cid in np.unique(merged_cluster_mapped_final):
401
554
  members = np.where(merged_cluster_mapped_final == cid)[0].tolist()
402
555
  cluster_members_map[cid] = members
403
- csize = len(members)
404
- final_cluster_counts.append(csize)
405
- cluster_sizes_final[members] = csize
406
- if csize == 1:
407
- final_keeper_for_cluster[cid] = members[0]
408
- else:
409
- # prefer keep_best_metric if available; do not automatically prefer lex-keeper here unless you want to;
410
- # (user previously asked for preferring lex keepers — if desired, you can prefer lex_is_keeper among members)
411
- obs_index = list(adata_subset.obs.index)
412
- member_names = [obs_index[m] for m in members]
413
- if keep_best_metric is not None and keep_best_metric in adata_subset.obs.columns:
414
- try:
415
- vals = pd.to_numeric(adata_subset.obs.loc[member_names, keep_best_metric], errors="coerce").to_numpy(dtype=float)
416
- except Exception:
417
- vals = np.array([np.nan] * len(members), dtype=float)
418
- if np.all(np.isnan(vals)):
419
- final_keeper_for_cluster[cid] = members[0]
420
- else:
421
- if keep_best_higher:
422
- nan_mask = np.isnan(vals)
423
- vals[nan_mask] = -np.inf
424
- rel_idx = int(np.nanargmax(vals))
425
- else:
426
- nan_mask = np.isnan(vals)
427
- vals[nan_mask] = np.inf
428
- rel_idx = int(np.nanargmin(vals))
429
- final_keeper_for_cluster[cid] = members[rel_idx]
430
- else:
431
- # if lex keepers present among members, prefer them
432
- lex_members = [m for m in members if lex_is_keeper[m]]
433
- if len(lex_members) > 0:
434
- final_keeper_for_cluster[cid] = lex_members[0]
435
- else:
436
- final_keeper_for_cluster[cid] = members[0]
437
-
438
- # update sequence__is_duplicate based on final clusters: non-keepers in multi-member clusters are duplicates
556
+ cluster_sizes_final[members] = len(members)
557
+
558
+ lex_mask_members = np.array([bool(lex_mask_full[m]) for m in members], dtype=bool)
559
+
560
+ keeper = _choose_keeper_with_demux_preference(
561
+ members,
562
+ adata_subset,
563
+ obs_index,
564
+ demux_col=demux_col,
565
+ preferred_types=demux_types,
566
+ keep_best_metric=keep_best_metric,
567
+ keep_best_higher=keep_best_higher,
568
+ lex_keeper_mask=lex_mask_members,
569
+ )
570
+ final_keeper_for_cluster[cid] = keeper
571
+
572
+ # update sequence__is_duplicate based on final clusters
439
573
  sequence_is_duplicate = np.zeros((N,), dtype=bool)
440
- for cid in final_unique:
574
+ for cid in np.unique(merged_cluster_mapped_final):
441
575
  keeper = final_keeper_for_cluster[cid]
442
576
  members = cluster_members_map[cid]
443
577
  if len(members) > 1:
@@ -446,8 +580,7 @@ def flag_duplicate_reads(
446
580
  sequence_is_duplicate[m] = True
447
581
 
448
582
  # propagate hierarchical distances into hierarchical_min_pair for all cluster members
449
- for (i_g, j_g, d) in hierarchical_pairs:
450
- # identify their final cluster ids (after unions)
583
+ for i_g, j_g, d in hierarchical_pairs:
451
584
  c_i = merged_cluster_mapped_final[int(i_g)]
452
585
  c_j = merged_cluster_mapped_final[int(j_g)]
453
586
  members_i = cluster_members_map.get(c_i, [int(i_g)])
@@ -459,7 +592,7 @@ def flag_duplicate_reads(
459
592
  if np.isnan(hierarchical_min_pair[mj]) or (d < hierarchical_min_pair[mj]):
460
593
  hierarchical_min_pair[mj] = d
461
594
 
462
- # combine lex-phase min_pair and hierarchical_min_pair into the final sequence__min_hamming_to_pair
595
+ # combine min pairs
463
596
  combined_min = min_pair.copy()
464
597
  for i in range(N):
465
598
  hval = hierarchical_min_pair[i]
@@ -475,69 +608,117 @@ def flag_duplicate_reads(
475
608
  adata_subset.obs["rev_hamming_to_prev"] = rev_hamming_to_prev
476
609
  adata_subset.obs["sequence__hier_hamming_to_pair"] = hierarchical_min_pair
477
610
  adata_subset.obs["sequence__min_hamming_to_pair"] = combined_min
478
- # persist lex bookkeeping columns (informational)
611
+ # persist lex bookkeeping
479
612
  adata_subset.obs["sequence__lex_is_keeper"] = lex_is_keeper
480
613
  adata_subset.obs["sequence__lex_is_duplicate"] = lex_is_duplicate
481
614
 
482
615
  adata_processed_list.append(adata_subset)
483
616
 
484
- histograms.append({
485
- "sample": sample,
486
- "reference": ref,
487
- "distances": local_hamming_dists, # lex local comparisons
488
- "cluster_counts": final_cluster_counts,
489
- "hierarchical_pairs": hierarchical_found_dists,
490
- })
491
-
492
- # Merge annotated subsets back together BEFORE plotting so plotting sees fwd_hamming_to_next, etc.
617
+ histograms.append(
618
+ {
619
+ "sample": sample,
620
+ "reference": ref,
621
+ "distances": local_hamming_dists,
622
+ "cluster_counts": [
623
+ int(x) for x in np.unique(cluster_sizes_final[cluster_sizes_final > 0])
624
+ ],
625
+ "hierarchical_pairs": hierarchical_found_dists,
626
+ }
627
+ )
628
+
629
+ # Merge annotated subsets back together BEFORE plotting
493
630
  _original_uns = copy.deepcopy(adata.uns)
494
631
  if len(adata_processed_list) == 0:
495
632
  return adata.copy(), adata.copy()
496
633
 
497
634
  adata_full = ad.concat(adata_processed_list, merge="same", join="outer", index_unique=None)
635
+
636
+ # preserve uns (prefer original on conflicts)
637
+ def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer="orig") -> dict:
638
+ """Merge .uns dictionaries while preserving original on conflicts."""
639
+ out = copy.deepcopy(new_uns) if new_uns is not None else {}
640
+ for k, v in (orig_uns or {}).items():
641
+ if k not in out:
642
+ out[k] = copy.deepcopy(v)
643
+ else:
644
+ try:
645
+ equal = out[k] == v
646
+ except Exception:
647
+ equal = False
648
+ if equal:
649
+ continue
650
+ if prefer == "orig":
651
+ out[k] = copy.deepcopy(v)
652
+ else:
653
+ out[f"orig_uns__{k}"] = copy.deepcopy(v)
654
+ return out
655
+
498
656
  adata_full.uns = merge_uns_preserve(_original_uns, adata_full.uns, prefer="orig")
499
657
 
500
658
  # Ensure expected numeric columns exist (create if missing)
501
- for col in ("fwd_hamming_to_next", "rev_hamming_to_prev", "sequence__min_hamming_to_pair", "sequence__hier_hamming_to_pair"):
659
+ for col in (
660
+ "fwd_hamming_to_next",
661
+ "rev_hamming_to_prev",
662
+ "sequence__min_hamming_to_pair",
663
+ "sequence__hier_hamming_to_pair",
664
+ ):
502
665
  if col not in adata_full.obs.columns:
503
666
  adata_full.obs[col] = np.nan
504
667
 
505
- # histograms (now driven by adata_full if requested)
506
- hist_outs = os.path.join(output_directory, "read_pair_hamming_distance_histograms")
507
- make_dirs([hist_outs])
508
- plot_histogram_pages(histograms,
509
- distance_threshold=distance_threshold,
510
- adata=adata_full,
511
- output_directory=hist_outs,
512
- distance_types=["min","fwd","rev","hier","lex_local"],
513
- sample_key=sample_col,
514
- )
668
+ # histograms
669
+ hist_outs = (
670
+ os.path.join(output_directory, "read_pair_hamming_distance_histograms")
671
+ if output_directory
672
+ else None
673
+ )
674
+ if hist_outs:
675
+ make_dirs([hist_outs])
676
+ plot_histogram_pages(
677
+ histograms,
678
+ distance_threshold=distance_threshold,
679
+ adata=adata_full,
680
+ output_directory=hist_outs,
681
+ distance_types=["min", "fwd", "rev", "hier", "lex_local"],
682
+ sample_key=sample_col,
683
+ )
515
684
 
516
685
  # hamming vs metric scatter
517
- scatter_outs = os.path.join(output_directory, "read_pair_hamming_distance_scatter_plots")
518
- make_dirs([scatter_outs])
519
- plot_hamming_vs_metric_pages(adata_full,
520
- metric_keys=metric_keys,
521
- output_dir=scatter_outs,
522
- hamming_col="sequence__min_hamming_to_pair",
523
- highlight_threshold=distance_threshold,
524
- highlight_color="red",
525
- sample_col=sample_col)
686
+ scatter_outs = (
687
+ os.path.join(output_directory, "read_pair_hamming_distance_scatter_plots")
688
+ if output_directory
689
+ else None
690
+ )
691
+ if scatter_outs:
692
+ make_dirs([scatter_outs])
693
+ plot_hamming_vs_metric_pages(
694
+ adata_full,
695
+ metric_keys=metric_keys,
696
+ output_dir=scatter_outs,
697
+ hamming_col="sequence__min_hamming_to_pair",
698
+ highlight_threshold=distance_threshold,
699
+ highlight_color="red",
700
+ sample_col=sample_col,
701
+ )
526
702
 
527
703
  # boolean columns from neighbor distances
528
- fwd_vals = pd.to_numeric(adata_full.obs.get("fwd_hamming_to_next", pd.Series(np.nan, index=adata_full.obs.index)), errors="coerce")
529
- rev_vals = pd.to_numeric(adata_full.obs.get("rev_hamming_to_prev", pd.Series(np.nan, index=adata_full.obs.index)), errors="coerce")
704
+ fwd_vals = pd.to_numeric(
705
+ adata_full.obs.get("fwd_hamming_to_next", pd.Series(np.nan, index=adata_full.obs.index)),
706
+ errors="coerce",
707
+ )
708
+ rev_vals = pd.to_numeric(
709
+ adata_full.obs.get("rev_hamming_to_prev", pd.Series(np.nan, index=adata_full.obs.index)),
710
+ errors="coerce",
711
+ )
530
712
  is_dup_dist = (fwd_vals < float(distance_threshold)) | (rev_vals < float(distance_threshold))
531
713
  is_dup_dist = is_dup_dist.fillna(False).astype(bool)
532
714
  adata_full.obs["is_duplicate_distance"] = is_dup_dist.values
533
715
 
534
- # combine sequence-derived flag with others
535
- if "sequence__is_duplicate" in adata_full.obs.columns:
536
- seq_dup = adata_full.obs["sequence__is_duplicate"].astype(bool)
537
- else:
538
- seq_dup = pd.Series(False, index=adata_full.obs.index)
539
-
540
- # cluster-based duplicate indicator (if any clustering columns exist)
716
+ # combine with sequence flag and any clustering flags
717
+ seq_dup = (
718
+ adata_full.obs["sequence__is_duplicate"].astype(bool)
719
+ if "sequence__is_duplicate" in adata_full.obs.columns
720
+ else pd.Series(False, index=adata_full.obs.index)
721
+ )
541
722
  cluster_cols = [c for c in adata_full.obs.columns if c.startswith("hamming_cluster__")]
542
723
  if cluster_cols:
543
724
  cl_mask = pd.Series(False, index=adata_full.obs.index)
@@ -550,59 +731,61 @@ def flag_duplicate_reads(
550
731
  else:
551
732
  adata_full.obs["is_duplicate_clustering"] = False
552
733
 
553
- final_dup = seq_dup | adata_full.obs["is_duplicate_distance"].astype(bool) | adata_full.obs["is_duplicate_clustering"].astype(bool)
734
+ final_dup = (
735
+ seq_dup
736
+ | adata_full.obs["is_duplicate_distance"].astype(bool)
737
+ | adata_full.obs["is_duplicate_clustering"].astype(bool)
738
+ )
554
739
  adata_full.obs["is_duplicate"] = final_dup.values
555
740
 
556
- # Final keeper enforcement: recompute per-cluster keeper from sequence__merged_cluster_id and
557
- # ensure that keeper is not marked duplicate
558
- if "sequence__merged_cluster_id" in adata_full.obs.columns:
559
- keeper_idx_by_cluster = {}
560
- metric_col = keep_best_metric if 'keep_best_metric' in locals() else None
741
+ # -------- Final keeper enforcement on adata_full (demux-aware) --------
742
+ keeper_idx_by_cluster = {}
743
+ metric_col = keep_best_metric
561
744
 
562
- # group by cluster id
563
- grp = adata_full.obs[["sequence__merged_cluster_id", "sequence__cluster_size"]].copy()
564
- for cid, sub in grp.groupby("sequence__merged_cluster_id"):
565
- try:
566
- members = sub.index.to_list()
567
- except Exception:
568
- members = list(sub.index)
569
- keeper = None
570
- # prefer keep_best_metric (if present), else prefer lex keeper among members, else first member
571
- if metric_col and metric_col in adata_full.obs.columns:
572
- try:
573
- vals = pd.to_numeric(adata_full.obs.loc[members, metric_col], errors="coerce")
574
- if vals.notna().any():
575
- keeper = vals.idxmax() if keep_best_higher else vals.idxmin()
576
- else:
577
- keeper = members[0]
578
- except Exception:
579
- keeper = members[0]
580
- else:
581
- # prefer lex keeper if present in this merged cluster
582
- lex_candidates = [m for m in members if ("sequence__lex_is_keeper" in adata_full.obs.columns and adata_full.obs.at[m, "sequence__lex_is_keeper"])]
583
- if len(lex_candidates) > 0:
584
- keeper = lex_candidates[0]
585
- else:
586
- keeper = members[0]
745
+ # Build an index→row-number mapping
746
+ name_to_pos = {name: i for i, name in enumerate(adata_full.obs.index)}
747
+ obs_index_full = list(adata_full.obs.index)
587
748
 
588
- keeper_idx_by_cluster[cid] = keeper
749
+ lex_col = (
750
+ "sequence__lex_is_keeper" if "sequence__lex_is_keeper" in adata_full.obs.columns else None
751
+ )
589
752
 
590
- # force keepers not to be duplicates
591
- is_dup_series = adata_full.obs["is_duplicate"].astype(bool)
592
- for cid, keeper_idx in keeper_idx_by_cluster.items():
593
- if keeper_idx in adata_full.obs.index:
594
- is_dup_series.at[keeper_idx] = False
595
- # clear sequence__is_duplicate for keeper if present
596
- if "sequence__is_duplicate" in adata_full.obs.columns:
597
- adata_full.obs.at[keeper_idx, "sequence__is_duplicate"] = False
598
- # clear lex duplicate flag too if present
599
- if "sequence__lex_is_duplicate" in adata_full.obs.columns:
600
- adata_full.obs.at[keeper_idx, "sequence__lex_is_duplicate"] = False
753
+ for cid, sub in adata_full.obs.groupby("sequence__merged_cluster_id", dropna=False):
754
+ members_names = sub.index.to_list()
755
+ members_pos = [name_to_pos[n] for n in members_names]
601
756
 
602
- adata_full.obs["is_duplicate"] = is_dup_series.values
757
+ if lex_col:
758
+ lex_mask_members = adata_full.obs.loc[members_names, lex_col].astype(bool).to_numpy()
759
+ else:
760
+ lex_mask_members = np.zeros(len(members_pos), dtype=bool)
761
+
762
+ keeper_pos = _choose_keeper_with_demux_preference(
763
+ members_pos,
764
+ adata_full,
765
+ obs_index_full,
766
+ demux_col=demux_col,
767
+ preferred_types=demux_types,
768
+ keep_best_metric=metric_col,
769
+ keep_best_higher=keep_best_higher,
770
+ lex_keeper_mask=lex_mask_members,
771
+ )
772
+ keeper_name = obs_index_full[keeper_pos]
773
+ keeper_idx_by_cluster[cid] = keeper_name
774
+
775
+ # enforce: keepers are not duplicates
776
+ is_dup_series = adata_full.obs["is_duplicate"].astype(bool)
777
+ for cid, keeper_name in keeper_idx_by_cluster.items():
778
+ if keeper_name in adata_full.obs.index:
779
+ is_dup_series.at[keeper_name] = False
780
+ if "sequence__is_duplicate" in adata_full.obs.columns:
781
+ adata_full.obs.at[keeper_name, "sequence__is_duplicate"] = False
782
+ if "sequence__lex_is_duplicate" in adata_full.obs.columns:
783
+ adata_full.obs.at[keeper_name, "sequence__lex_is_duplicate"] = False
784
+ adata_full.obs["is_duplicate"] = is_dup_series.values
603
785
 
604
786
  # reason column
605
787
  def _dup_reason_row(row):
788
+ """Build a semi-colon delimited duplicate reason string."""
606
789
  reasons = []
607
790
  if row.get("is_duplicate_distance", False):
608
791
  reasons.append("distance_thresh")
@@ -632,6 +815,7 @@ def flag_duplicate_reads(
632
815
  # Plot helpers (use adata_full as input)
633
816
  # ---------------------------
634
817
 
818
+
635
819
  def plot_histogram_pages(
636
820
  histograms,
637
821
  distance_threshold,
@@ -674,10 +858,11 @@ def plot_histogram_pages(
674
858
  use_adata = False
675
859
 
676
860
  if len(samples) == 0 or len(references) == 0:
677
- print("No histogram data to plot.")
861
+ logger.info("No histogram data to plot.")
678
862
  return {"distance_pages": [], "cluster_size_pages": []}
679
863
 
680
864
  def clean_array(arr):
865
+ """Filter array values to finite [0, 1] range for plotting."""
681
866
  if arr is None or len(arr) == 0:
682
867
  return np.array([], dtype=float)
683
868
  a = np.asarray(arr, dtype=float)
@@ -707,7 +892,9 @@ def plot_histogram_pages(
707
892
  if "rev" in distance_types and "rev_hamming_to_prev" in group.columns:
708
893
  grid[(s, r)]["rev"].extend(clean_array(group["rev_hamming_to_prev"].to_numpy()))
709
894
  if "hier" in distance_types and "sequence__hier_hamming_to_pair" in group.columns:
710
- grid[(s, r)]["hier"].extend(clean_array(group["sequence__hier_hamming_to_pair"].to_numpy()))
895
+ grid[(s, r)]["hier"].extend(
896
+ clean_array(group["sequence__hier_hamming_to_pair"].to_numpy())
897
+ )
711
898
  else:
712
899
  for (s, r), group in grouped:
713
900
  if "min" in distance_types and distance_key in group.columns:
@@ -717,7 +904,9 @@ def plot_histogram_pages(
717
904
  if "rev" in distance_types and "rev_hamming_to_prev" in group.columns:
718
905
  grid[(s, r)]["rev"].extend(clean_array(group["rev_hamming_to_prev"].to_numpy()))
719
906
  if "hier" in distance_types and "sequence__hier_hamming_to_pair" in group.columns:
720
- grid[(s, r)]["hier"].extend(clean_array(group["sequence__hier_hamming_to_pair"].to_numpy()))
907
+ grid[(s, r)]["hier"].extend(
908
+ clean_array(group["sequence__hier_hamming_to_pair"].to_numpy())
909
+ )
721
910
 
722
911
  # legacy histograms fallback
723
912
  if histograms:
@@ -753,9 +942,17 @@ def plot_histogram_pages(
753
942
 
754
943
  # counts (for labels)
755
944
  if use_adata:
756
- counts = {(s, r): int(((adata.obs[sample_key] == s) & (adata.obs[ref_key] == r)).sum()) for s in samples for r in references}
945
+ counts = {
946
+ (s, r): int(((adata.obs[sample_key] == s) & (adata.obs[ref_key] == r)).sum())
947
+ for s in samples
948
+ for r in references
949
+ }
757
950
  else:
758
- counts = {(s, r): sum(len(grid[(s, r)][dt]) for dt in distance_types) for s in samples for r in references}
951
+ counts = {
952
+ (s, r): sum(len(grid[(s, r)][dt]) for dt in distance_types)
953
+ for s in samples
954
+ for r in references
955
+ }
759
956
 
760
957
  distance_pages = []
761
958
  cluster_size_pages = []
@@ -773,7 +970,9 @@ def plot_histogram_pages(
773
970
  # Distance histogram page
774
971
  fig_w = figsize_per_cell[0] * ncols
775
972
  fig_h = figsize_per_cell[1] * nrows
776
- fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False)
973
+ fig, axes = plt.subplots(
974
+ nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False
975
+ )
777
976
 
778
977
  for r_idx, sample_name in enumerate(chunk):
779
978
  for c_idx, ref_name in enumerate(references):
@@ -789,17 +988,37 @@ def plot_histogram_pages(
789
988
  vals = vals[(vals >= 0.0) & (vals <= ref_vmax)]
790
989
  if vals.size > 0:
791
990
  any_data = True
792
- ax.hist(vals, bins=bins_edges, alpha=0.5, label=dtype, density=False, stacked=False,
793
- color=dtype_colors.get(dtype, None))
991
+ ax.hist(
992
+ vals,
993
+ bins=bins_edges,
994
+ alpha=0.5,
995
+ label=dtype,
996
+ density=False,
997
+ stacked=False,
998
+ color=dtype_colors.get(dtype, None),
999
+ )
794
1000
  if not any_data:
795
- ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes, fontsize=10, color="gray")
1001
+ ax.text(
1002
+ 0.5,
1003
+ 0.5,
1004
+ "No data",
1005
+ ha="center",
1006
+ va="center",
1007
+ transform=ax.transAxes,
1008
+ fontsize=10,
1009
+ color="gray",
1010
+ )
796
1011
  # threshold line (make sure it is within axis)
797
1012
  ax.axvline(distance_threshold, color="red", linestyle="--", linewidth=1)
798
1013
 
799
1014
  if r_idx == 0:
800
1015
  ax.set_title(str(ref_name), fontsize=10)
801
1016
  if c_idx == 0:
802
- total_reads = sum(counts.get((sample_name, ref), 0) for ref in references) if not use_adata else int((adata.obs[sample_key] == sample_name).sum())
1017
+ total_reads = (
1018
+ sum(counts.get((sample_name, ref), 0) for ref in references)
1019
+ if not use_adata
1020
+ else int((adata.obs[sample_key] == sample_name).sum())
1021
+ )
803
1022
  ax.set_ylabel(f"{sample_name}\n(n={total_reads})", fontsize=9)
804
1023
  if r_idx == nrows - 1:
805
1024
  ax.set_xlabel("Hamming Distance", fontsize=9)
@@ -811,12 +1030,16 @@ def plot_histogram_pages(
811
1030
  if r_idx == 0 and c_idx == 0:
812
1031
  ax.legend(fontsize=7, loc="upper right")
813
1032
 
814
- fig.suptitle(f"Hamming distance histograms (rows=samples, cols=references) — page {page+1}/{n_pages}", fontsize=12, y=0.995)
1033
+ fig.suptitle(
1034
+ f"Hamming distance histograms (rows=samples, cols=references) — page {page + 1}/{n_pages}",
1035
+ fontsize=12,
1036
+ y=0.995,
1037
+ )
815
1038
  fig.tight_layout(rect=[0, 0, 1, 0.96])
816
1039
 
817
1040
  if output_directory:
818
1041
  os.makedirs(output_directory, exist_ok=True)
819
- fname = os.path.join(output_directory, f"hamming_histograms_page_{page+1}.png")
1042
+ fname = os.path.join(output_directory, f"hamming_histograms_page_{page + 1}.png")
820
1043
  plt.savefig(fname, bbox_inches="tight")
821
1044
  distance_pages.append(fname)
822
1045
  else:
@@ -826,22 +1049,43 @@ def plot_histogram_pages(
826
1049
  # Cluster-size histogram page (unchanged except it uses adata-derived sizes per cluster if available)
827
1050
  fig_w = figsize_per_cell[0] * ncols
828
1051
  fig_h = figsize_per_cell[1] * nrows
829
- fig2, axes2 = plt.subplots(nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False)
1052
+ fig2, axes2 = plt.subplots(
1053
+ nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False
1054
+ )
830
1055
 
831
1056
  for r_idx, sample_name in enumerate(chunk):
832
1057
  for c_idx, ref_name in enumerate(references):
833
1058
  ax = axes2[r_idx][c_idx]
834
1059
  sizes = []
835
- if use_adata and ("sequence__merged_cluster_id" in adata.obs.columns and "sequence__cluster_size" in adata.obs.columns):
836
- sub = adata.obs[(adata.obs[sample_key] == sample_name) & (adata.obs[ref_key] == ref_name)]
1060
+ if use_adata and (
1061
+ "sequence__merged_cluster_id" in adata.obs.columns
1062
+ and "sequence__cluster_size" in adata.obs.columns
1063
+ ):
1064
+ sub = adata.obs[
1065
+ (adata.obs[sample_key] == sample_name) & (adata.obs[ref_key] == ref_name)
1066
+ ]
837
1067
  if not sub.empty:
838
1068
  try:
839
- grp = sub.groupby("sequence__merged_cluster_id")["sequence__cluster_size"].first()
840
- sizes = [int(x) for x in grp.to_numpy().tolist() if (pd.notna(x) and np.isfinite(x))]
1069
+ grp = sub.groupby("sequence__merged_cluster_id")[
1070
+ "sequence__cluster_size"
1071
+ ].first()
1072
+ sizes = [
1073
+ int(x)
1074
+ for x in grp.to_numpy().tolist()
1075
+ if (pd.notna(x) and np.isfinite(x))
1076
+ ]
841
1077
  except Exception:
842
1078
  try:
843
- unique_pairs = sub[["sequence__merged_cluster_id", "sequence__cluster_size"]].drop_duplicates()
844
- sizes = [int(x) for x in unique_pairs["sequence__cluster_size"].dropna().astype(int).tolist()]
1079
+ unique_pairs = sub[
1080
+ ["sequence__merged_cluster_id", "sequence__cluster_size"]
1081
+ ].drop_duplicates()
1082
+ sizes = [
1083
+ int(x)
1084
+ for x in unique_pairs["sequence__cluster_size"]
1085
+ .dropna()
1086
+ .astype(int)
1087
+ .tolist()
1088
+ ]
845
1089
  except Exception:
846
1090
  sizes = []
847
1091
  if (not sizes) and histograms:
@@ -855,23 +1099,38 @@ def plot_histogram_pages(
855
1099
  ax.set_xlabel("Cluster size")
856
1100
  ax.set_ylabel("Count")
857
1101
  else:
858
- ax.text(0.5, 0.5, "No clusters", ha="center", va="center", transform=ax.transAxes, fontsize=10, color="gray")
1102
+ ax.text(
1103
+ 0.5,
1104
+ 0.5,
1105
+ "No clusters",
1106
+ ha="center",
1107
+ va="center",
1108
+ transform=ax.transAxes,
1109
+ fontsize=10,
1110
+ color="gray",
1111
+ )
859
1112
 
860
1113
  if r_idx == 0:
861
1114
  ax.set_title(str(ref_name), fontsize=10)
862
1115
  if c_idx == 0:
863
- total_reads = sum(counts.get((sample_name, ref), 0) for ref in references) if not use_adata else int((adata.obs[sample_key] == sample_name).sum())
1116
+ total_reads = (
1117
+ sum(counts.get((sample_name, ref), 0) for ref in references)
1118
+ if not use_adata
1119
+ else int((adata.obs[sample_key] == sample_name).sum())
1120
+ )
864
1121
  ax.set_ylabel(f"{sample_name}\n(n={total_reads})", fontsize=9)
865
1122
  if r_idx != nrows - 1:
866
1123
  ax.set_xticklabels([])
867
1124
 
868
1125
  ax.grid(True, alpha=0.25)
869
1126
 
870
- fig2.suptitle(f"Union-find cluster size histograms — page {page+1}/{n_pages}", fontsize=12, y=0.995)
1127
+ fig2.suptitle(
1128
+ f"Union-find cluster size histograms — page {page + 1}/{n_pages}", fontsize=12, y=0.995
1129
+ )
871
1130
  fig2.tight_layout(rect=[0, 0, 1, 0.96])
872
1131
 
873
1132
  if output_directory:
874
- fname2 = os.path.join(output_directory, f"cluster_size_histograms_page_{page+1}.png")
1133
+ fname2 = os.path.join(output_directory, f"cluster_size_histograms_page_{page + 1}.png")
875
1134
  plt.savefig(fname2, bbox_inches="tight")
876
1135
  cluster_size_pages.append(fname2)
877
1136
  else:
@@ -923,7 +1182,9 @@ def plot_hamming_vs_metric_pages(

  obs = adata.obs
  if sample_col not in obs.columns or ref_col not in obs.columns:
- raise ValueError(f"sample_col '{sample_col}' and ref_col '{ref_col}' must exist in adata.obs")
+ raise ValueError(
+ f"sample_col '{sample_col}' and ref_col '{ref_col}' must exist in adata.obs"
+ )

  # canonicalize samples and refs
  if samples is None:
@@ -964,14 +1225,24 @@ def plot_hamming_vs_metric_pages(
  sY = pd.to_numeric(obs[hamming_col], errors="coerce") if hamming_present else None

  if (sX is not None) and (sY is not None):
- valid_both = sX.notna() & sY.notna() & np.isfinite(sX.values) & np.isfinite(sY.values)
+ valid_both = (
+ sX.notna() & sY.notna() & np.isfinite(sX.values) & np.isfinite(sY.values)
+ )
  if valid_both.any():
  xvals = sX[valid_both].to_numpy(dtype=float)
  yvals = sY[valid_both].to_numpy(dtype=float)
  xmin, xmax = float(np.nanmin(xvals)), float(np.nanmax(xvals))
  ymin, ymax = float(np.nanmin(yvals)), float(np.nanmax(yvals))
- xpad = max(1e-6, (xmax - xmin) * 0.05) if xmax > xmin else max(1e-3, abs(xmin) * 0.05 + 1e-3)
- ypad = max(1e-6, (ymax - ymin) * 0.05) if ymax > ymin else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ xpad = (
+ max(1e-6, (xmax - xmin) * 0.05)
+ if xmax > xmin
+ else max(1e-3, abs(xmin) * 0.05 + 1e-3)
+ )
+ ypad = (
+ max(1e-6, (ymax - ymin) * 0.05)
+ if ymax > ymin
+ else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ )
  global_xlim = (xmin - xpad, xmax + xpad)
  global_ylim = (ymin - ypad, ymax + ypad)
  else:
@@ -979,23 +1250,39 @@ def plot_hamming_vs_metric_pages(
  sY_finite = sY[np.isfinite(sY)]
  if sX_finite.size > 0:
  xmin, xmax = float(np.nanmin(sX_finite)), float(np.nanmax(sX_finite))
- xpad = max(1e-6, (xmax - xmin) * 0.05) if xmax > xmin else max(1e-3, abs(xmin) * 0.05 + 1e-3)
+ xpad = (
+ max(1e-6, (xmax - xmin) * 0.05)
+ if xmax > xmin
+ else max(1e-3, abs(xmin) * 0.05 + 1e-3)
+ )
  global_xlim = (xmin - xpad, xmax + xpad)
  if sY_finite.size > 0:
  ymin, ymax = float(np.nanmin(sY_finite)), float(np.nanmax(sY_finite))
- ypad = max(1e-6, (ymax - ymin) * 0.05) if ymax > ymin else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ ypad = (
+ max(1e-6, (ymax - ymin) * 0.05)
+ if ymax > ymin
+ else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ )
  global_ylim = (ymin - ypad, ymax + ypad)
  elif sX is not None:
  sX_finite = sX[np.isfinite(sX)]
  if sX_finite.size > 0:
  xmin, xmax = float(np.nanmin(sX_finite)), float(np.nanmax(sX_finite))
- xpad = max(1e-6, (xmax - xmin) * 0.05) if xmax > xmin else max(1e-3, abs(xmin) * 0.05 + 1e-3)
+ xpad = (
+ max(1e-6, (xmax - xmin) * 0.05)
+ if xmax > xmin
+ else max(1e-3, abs(xmin) * 0.05 + 1e-3)
+ )
  global_xlim = (xmin - xpad, xmax + xpad)
  elif sY is not None:
  sY_finite = sY[np.isfinite(sY)]
  if sY_finite.size > 0:
  ymin, ymax = float(np.nanmin(sY_finite)), float(np.nanmax(sY_finite))
- ypad = max(1e-6, (ymax - ymin) * 0.05) if ymax > ymin else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ ypad = (
+ max(1e-6, (ymax - ymin) * 0.05)
+ if ymax > ymin
+ else max(1e-3, abs(ymin) * 0.05 + 1e-3)
+ )
  global_ylim = (ymin - ypad, ymax + ypad)

  # pagination
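The four padding hunks above apply one rule: pad each axis limit by 5% of the data span, falling back to a small absolute pad when the span is degenerate (min equals max). A sketch of that rule pulled out into a helper; the name `_pad_limits` is hypothetical, and the library keeps the inline ternaries shown in the diff:

```python
def _pad_limits(vmin: float, vmax: float) -> tuple:
    """Hypothetical helper mirroring the xpad/ypad ternaries in the diff."""
    if vmax > vmin:
        pad = max(1e-6, (vmax - vmin) * 0.05)     # 5% of the data span
    else:
        pad = max(1e-3, abs(vmin) * 0.05 + 1e-3)  # degenerate range fallback
    return vmin - pad, vmax + pad

print(_pad_limits(0.0, 1.0))  # (-0.05, 1.05)
print(_pad_limits(0.2, 0.2))  # roughly (0.189, 0.211)
```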
@@ -1005,15 +1292,19 @@ def plot_hamming_vs_metric_pages(
  ncols = len(cols)
  fig_w = ncols * figsize_per_cell[0]
  fig_h = nrows * figsize_per_cell[1]
- fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False)
+ fig, axes = plt.subplots(
+ nrows=nrows, ncols=ncols, figsize=(fig_w, fig_h), dpi=dpi, squeeze=False
+ )

  for r_idx, sample_name in enumerate(chunk):
  for c_idx, ref_name in enumerate(cols):
  ax = axes[r_idx][c_idx]
  if ref_name == extra_col:
- mask = (obs[sample_col].values == sample_name)
+ mask = obs[sample_col].values == sample_name
  else:
- mask = (obs[sample_col].values == sample_name) & (obs[ref_col].values == ref_name)
+ mask = (obs[sample_col].values == sample_name) & (
+ obs[ref_col].values == ref_name
+ )

  sub = obs[mask]

@@ -1022,7 +1313,9 @@ def plot_hamming_vs_metric_pages(
  else:
  x_all = np.array([], dtype=float)
  if hamming_col in sub.columns:
- y_all = pd.to_numeric(sub[hamming_col], errors="coerce").to_numpy(dtype=float)
+ y_all = pd.to_numeric(sub[hamming_col], errors="coerce").to_numpy(
+ dtype=float
+ )
  else:
  y_all = np.array([], dtype=float)

@@ -1040,32 +1333,67 @@ def plot_hamming_vs_metric_pages(
  idxs_valid = np.array([], dtype=int)

  if x.size == 0:
- ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
+ ax.text(
+ 0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes
+ )
  clusters_info[(sample_name, ref_name)] = {"diag": None, "n_points": 0}
  else:
  # Decide color mapping
- if color_by_duplicate and duplicate_col in adata.obs.columns and idxs_valid.size > 0:
+ if (
+ color_by_duplicate
+ and duplicate_col in adata.obs.columns
+ and idxs_valid.size > 0
+ ):
  # get boolean series aligned to idxs_valid
  try:
- dup_flags = adata.obs.loc[idxs_valid, duplicate_col].astype(bool).to_numpy()
+ dup_flags = (
+ adata.obs.loc[idxs_valid, duplicate_col].astype(bool).to_numpy()
+ )
  except Exception:
  dup_flags = np.zeros(len(idxs_valid), dtype=bool)
  mask_dup = dup_flags
  mask_nondup = ~mask_dup
  # plot non-duplicates first in gray, duplicates in highlight color
  if mask_nondup.any():
- ax.scatter(x[mask_nondup], y[mask_nondup], s=12, alpha=0.6, rasterized=True, c="lightgray")
+ ax.scatter(
+ x[mask_nondup],
+ y[mask_nondup],
+ s=12,
+ alpha=0.6,
+ rasterized=True,
+ c="lightgray",
+ )
  if mask_dup.any():
- ax.scatter(x[mask_dup], y[mask_dup], s=20, alpha=0.9, rasterized=True, c=highlight_color, edgecolors="k", linewidths=0.3)
+ ax.scatter(
+ x[mask_dup],
+ y[mask_dup],
+ s=20,
+ alpha=0.9,
+ rasterized=True,
+ c=highlight_color,
+ edgecolors="k",
+ linewidths=0.3,
+ )
  else:
  # old behavior: highlight by threshold if requested
  if highlight_threshold is not None and y.size:
  mask_low = (y < float(highlight_threshold)) & np.isfinite(y)
  mask_high = ~mask_low
  if mask_high.any():
- ax.scatter(x[mask_high], y[mask_high], s=12, alpha=0.6, rasterized=True)
+ ax.scatter(
+ x[mask_high], y[mask_high], s=12, alpha=0.6, rasterized=True
+ )
  if mask_low.any():
- ax.scatter(x[mask_low], y[mask_low], s=18, alpha=0.9, rasterized=True, c=highlight_color, edgecolors="k", linewidths=0.3)
+ ax.scatter(
+ x[mask_low],
+ y[mask_low],
+ s=18,
+ alpha=0.9,
+ rasterized=True,
+ c=highlight_color,
+ edgecolors="k",
+ linewidths=0.3,
+ )
  else:
  ax.scatter(x, y, s=12, alpha=0.6, rasterized=True)

@@ -1081,7 +1409,9 @@ def plot_hamming_vs_metric_pages(
  zi = gaussian_kde(np.vstack([x, y]))(coords).reshape(xi_g.shape)
  ax.contourf(xi_g, yi_g, zi, levels=8, alpha=0.35, cmap="Blues")
  else:
- ax.scatter(x, y, c=kde2, s=16, cmap="viridis", alpha=0.7, linewidths=0)
+ ax.scatter(
+ x, y, c=kde2, s=16, cmap="viridis", alpha=0.7, linewidths=0
+ )
  except Exception:
  pass
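The KDE hunk above evaluates a 2-D Gaussian kernel density on a grid and overlays it as filled contours, falling back to density-colored points. A self-contained sketch of the grid/contour pattern with synthetic data; the grid resolution here is illustrative, not the library's:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
x = rng.normal(0.0, 1.0, 500)
y = 0.5 * x + rng.normal(0.0, 0.5, 500)

# Evaluate the KDE on a regular grid spanning the data.
xi = np.linspace(x.min(), x.max(), 100)
yi = np.linspace(y.min(), y.max(), 100)
xi_g, yi_g = np.meshgrid(xi, yi)
coords = np.vstack([xi_g.ravel(), yi_g.ravel()])
zi = gaussian_kde(np.vstack([x, y]))(coords).reshape(xi_g.shape)

fig, ax = plt.subplots()
ax.scatter(x, y, s=12, alpha=0.6, rasterized=True)
ax.contourf(xi_g, yi_g, zi, levels=8, alpha=0.35, cmap="Blues")  # density overlay
plt.show()
```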

@@ -1090,16 +1420,29 @@ def plot_hamming_vs_metric_pages(
  a, b = np.polyfit(x, y, 1)
  xs = np.linspace(np.nanmin(x), np.nanmax(x), 100)
  ys = a * xs + b
- ax.plot(xs, ys, linestyle="--", linewidth=1.2, alpha=0.9, color="red")
+ ax.plot(
+ xs, ys, linestyle="--", linewidth=1.2, alpha=0.9, color="red"
+ )
  r = np.corrcoef(x, y)[0, 1]
- ax.text(0.98, 0.02, f"r={float(r):.3f}", ha="right", va="bottom", transform=ax.transAxes, fontsize=8,
- bbox=dict(facecolor="white", alpha=0.6, boxstyle="round,pad=0.2"))
+ ax.text(
+ 0.98,
+ 0.02,
+ f"r={float(r):.3f}",
+ ha="right",
+ va="bottom",
+ transform=ax.transAxes,
+ fontsize=8,
+ bbox=dict(
+ facecolor="white", alpha=0.6, boxstyle="round,pad=0.2"
+ ),
+ )
  except Exception:
  pass

  if clustering:
  cl_labels, diag = _run_clustering(
- x, y,
+ x,
+ y,
  method=clustering.get("method", "dbscan"),
  n_clusters=clustering.get("n_clusters", 2),
  dbscan_eps=clustering.get("dbscan_eps", 0.05),
@@ -1113,32 +1456,59 @@ def plot_hamming_vs_metric_pages(
  if len(unique_nonnoise) > 0:
  medians = {}
  for lab in unique_nonnoise:
- mask_lab = (cl_labels == lab)
- medians[lab] = float(np.median(y[mask_lab])) if mask_lab.any() else float("nan")
- sorted_by_median = sorted(unique_nonnoise, key=lambda l: (np.nan if np.isnan(medians[l]) else medians[l]), reverse=True)
+ mask_lab = cl_labels == lab
+ medians[lab] = (
+ float(np.median(y[mask_lab]))
+ if mask_lab.any()
+ else float("nan")
+ )
+ sorted_by_median = sorted(
+ unique_nonnoise,
+ key=lambda idx: (
+ np.nan if np.isnan(medians[idx]) else medians[idx]
+ ),
+ reverse=True,
+ )
  mapping = {old: new for new, old in enumerate(sorted_by_median)}
  for i_lab in range(len(remapped_labels)):
  if remapped_labels[i_lab] != -1:
- remapped_labels[i_lab] = mapping.get(remapped_labels[i_lab], -1)
+ remapped_labels[i_lab] = mapping.get(
+ remapped_labels[i_lab], -1
+ )
  diag = diag or {}
- diag["cluster_median_hamming"] = {int(old): medians[old] for old in medians}
- diag["cluster_old_to_new_map"] = {int(old): int(new) for old, new in mapping.items()}
+ diag["cluster_median_hamming"] = {
+ int(old): medians[old] for old in medians
+ }
+ diag["cluster_old_to_new_map"] = {
+ int(old): int(new) for old, new in mapping.items()
+ }
  else:
  remapped_labels = cl_labels.copy()
  diag = diag or {}
  diag["cluster_median_hamming"] = {}
  diag["cluster_old_to_new_map"] = {}

- _overlay_clusters_on_ax(ax, x, y, remapped_labels, diag,
- cmap=clustering.get("cmap", "tab10"),
- hull=clustering.get("hull", True),
- show_cluster_labels=True)
+ _overlay_clusters_on_ax(
+ ax,
+ x,
+ y,
+ remapped_labels,
+ diag,
+ cmap=clustering.get("cmap", "tab10"),
+ hull=clustering.get("hull", True),
+ show_cluster_labels=True,
+ )

- clusters_info[(sample_name, ref_name)] = {"diag": diag, "n_points": len(x)}
+ clusters_info[(sample_name, ref_name)] = {
+ "diag": diag,
+ "n_points": len(x),
+ }

  if write_clusters_to_adata and idxs_valid.size > 0:
- colname_safe_ref = (ref_name if ref_name != extra_col else "ALLREFS")
- colname = f"hamming_cluster__{metric}__{sample_name}__{colname_safe_ref}"
+ colname_safe_ref = ref_name if ref_name != extra_col else "ALLREFS"
+ colname = (
+ f"hamming_cluster__{metric}__{sample_name}__{colname_safe_ref}"
+ )
  if colname not in adata.obs.columns:
  adata.obs[colname] = np.nan
  lab_arr = remapped_labels.astype(float)
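The relabeling reformatted above ranks the non-noise clusters by their median hamming value, highest first, and renumbers them contiguously from 0 while noise keeps -1; the old-to-new map and per-cluster medians are stashed in the diagnostics dict. A minimal sketch of that remapping on toy labels (standalone, not the library function):

```python
import numpy as np

cl_labels = np.array([0, 0, 1, 1, 2, 2, -1])          # raw cluster labels (-1 = noise)
y = np.array([0.2, 0.2, 0.8, 0.8, 0.5, 0.5, 0.3])     # per-point hamming values

unique_nonnoise = [int(lab) for lab in np.unique(cl_labels) if lab != -1]
medians = {lab: float(np.median(y[cl_labels == lab])) for lab in unique_nonnoise}

# Highest median hamming becomes cluster 0, the next becomes 1, and so on.
sorted_by_median = sorted(unique_nonnoise, key=lambda lab: medians[lab], reverse=True)
mapping = {old: new for new, old in enumerate(sorted_by_median)}

remapped = np.array([mapping[lab] if lab != -1 else -1 for lab in cl_labels])
print(medians)   # {0: 0.2, 1: 0.8, 2: 0.5}
print(remapped)  # old cluster 1 -> 0, 2 -> 1, 0 -> 2; noise stays -1
```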
@@ -1178,7 +1548,11 @@ def plot_hamming_vs_metric_pages(
  plt.show()
  plt.close(fig)

- saved_map[metric] = {"files": files, "clusters_info": clusters_info, "written_cols": written_cols}
+ saved_map[metric] = {
+ "files": files,
+ "clusters_info": clusters_info,
+ "written_cols": written_cols,
+ }

  return saved_map

@@ -1187,7 +1561,7 @@ def _run_clustering(
  x: np.ndarray,
  y: np.ndarray,
  *,
- method: str = "kmeans", # "kmeans", "dbscan", "gmm", "hdbscan"
+ method: str = "kmeans",  # "kmeans", "dbscan", "gmm", "hdbscan"
  n_clusters: int = 2,
  dbscan_eps: float = 0.05,
  dbscan_min_samples: int = 5,
@@ -1198,13 +1572,6 @@ def plot_hamming_vs_metric_pages(
  Run clustering on 2D points (x,y). Returns labels (len = npoints) and diagnostics dict.
  Labels follow sklearn conventions (noise -> -1 for DBSCAN/HDBSCAN).
  """
- try:
- from sklearn.cluster import KMeans, DBSCAN
- from sklearn.mixture import GaussianMixture
- from sklearn.metrics import silhouette_score
- except Exception:
- KMeans = DBSCAN = GaussianMixture = silhouette_score = None
-
  pts = np.column_stack([x, y])
  diagnostics: Dict[str, Any] = {"method": method, "n_input": len(x)}
  if len(x) < min_points:
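The lines removed above are the soft-import guard for scikit-learn inside `_run_clustering`; the silhouette hunk further down still checks `silhouette_score is not None`, so the same guard pattern remains in play. For reference, that pattern looks like this (a generic sketch of optional-dependency handling, not smftools' actual import layout in 0.3.0):

```python
from typing import Optional

# Bind the sklearn entry points, or None when scikit-learn is not installed.
try:
    from sklearn.cluster import KMeans, DBSCAN
    from sklearn.mixture import GaussianMixture
    from sklearn.metrics import silhouette_score
except Exception:
    KMeans = DBSCAN = GaussianMixture = silhouette_score = None


def maybe_silhouette(pts, labels) -> Optional[float]:
    """Return a silhouette score, or None when sklearn (or a second cluster) is missing."""
    if silhouette_score is None or len(set(labels)) < 2:
        return None
    return float(silhouette_score(pts, labels))
```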
@@ -1270,7 +1637,11 @@ def plot_hamming_vs_metric_pages(

  # compute silhouette if suitable
  try:
- if diagnostics.get("n_clusters_found", 0) >= 2 and len(x) >= 3 and silhouette_score is not None:
+ if (
+ diagnostics.get("n_clusters_found", 0) >= 2
+ and len(x) >= 3
+ and silhouette_score is not None
+ ):
  diagnostics["silhouette"] = float(silhouette_score(pts, labels))
  else:
  diagnostics["silhouette"] = None
@@ -1305,7 +1676,6 @@ def _overlay_clusters_on_ax(
  Labels == -1 are noise and drawn in grey.
  Also annotates cluster numbers near centroids (contiguous numbers starting at 0).
  """
- import matplotlib.colors as mcolors
  from scipy.spatial import ConvexHull

  labels = np.asarray(labels)
@@ -1323,19 +1693,47 @@ def _overlay_clusters_on_ax(
  if not mask.any():
  continue
  col = (0.6, 0.6, 0.6, 0.6) if lab == -1 else colors[idx % ncolors]
- ax.scatter(x[mask], y[mask], s=20, c=[col], alpha=alpha_pts, marker=marker, linewidths=0.2, edgecolors="none", rasterized=True)
+ ax.scatter(
+ x[mask],
+ y[mask],
+ s=20,
+ c=[col],
+ alpha=alpha_pts,
+ marker=marker,
+ linewidths=0.2,
+ edgecolors="none",
+ rasterized=True,
+ )

  if lab != -1:
  # centroid
  if plot_centroids:
  cx = float(np.mean(x[mask]))
  cy = float(np.mean(y[mask]))
- ax.scatter([cx], [cy], s=centroid_size, marker=centroid_marker, c=[col], edgecolor="k", linewidth=0.6, zorder=10)
+ ax.scatter(
+ [cx],
+ [cy],
+ s=centroid_size,
+ marker=centroid_marker,
+ c=[col],
+ edgecolor="k",
+ linewidth=0.6,
+ zorder=10,
+ )

  if show_cluster_labels:
- ax.text(cx, cy, str(int(lab)), color="white", fontsize=cluster_label_fontsize,
- ha="center", va="center", weight="bold", zorder=12,
- bbox=dict(facecolor=(0,0,0,0.5), pad=0.3, boxstyle="round"))
+ ax.text(
+ cx,
+ cy,
+ str(int(lab)),
+ color="white",
+ fontsize=cluster_label_fontsize,
+ ha="center",
+ va="center",
+ weight="bold",
+ zorder=12,
+ bbox=dict(facecolor=(0, 0, 0, 0.5), pad=0.3, boxstyle="round"),
+ )

  # hull
  if hull and np.sum(mask) >= 3:
@@ -1343,9 +1741,16 @@ def _overlay_clusters_on_ax(
  ch_pts = pts[mask]
  hull_idx = ConvexHull(ch_pts).vertices
  hull_poly = ch_pts[hull_idx]
- ax.fill(hull_poly[:, 0], hull_poly[:, 1], alpha=hull_alpha, facecolor=col, edgecolor=hull_edgecolor, linewidth=0.6, zorder=5)
+ ax.fill(
+ hull_poly[:, 0],
+ hull_poly[:, 1],
+ alpha=hull_alpha,
+ facecolor=col,
+ edgecolor=hull_edgecolor,
+ linewidth=0.6,
+ zorder=5,
+ )
  except Exception:
  pass

  return None
-
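To round off the overlay hunks: the hull fill uses the standard SciPy convex-hull recipe, taking the hull vertex indices, indexing back into the cluster's points, and filling the polygon, with a try/except because degenerate (e.g. collinear) point sets make `ConvexHull` raise. A standalone sketch with random points; the colors and alpha are illustrative:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull

rng = np.random.default_rng(1)
pts = rng.normal(size=(40, 2))  # one cluster's (x, y) points

fig, ax = plt.subplots()
ax.scatter(pts[:, 0], pts[:, 1], s=20)

# ConvexHull needs at least 3 non-degenerate points, hence the guard in the diff.
if pts.shape[0] >= 3:
    try:
        hull_idx = ConvexHull(pts).vertices   # hull vertex indices, in boundary order
        hull_poly = pts[hull_idx]
        ax.fill(hull_poly[:, 0], hull_poly[:, 1], alpha=0.15, edgecolor="k", linewidth=0.6)
    except Exception:
        pass  # degenerate geometry -> skip the hull
plt.show()
```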