PyPI - smftools - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

smftools/_version.py +1 -1
smftools/cli/chimeric_adata.py +1563 -0
smftools/cli/helpers.py +18 -2
smftools/cli/hmm_adata.py +18 -1
smftools/cli/latent_adata.py +522 -67
smftools/cli/load_adata.py +2 -2
smftools/cli/preprocess_adata.py +32 -93
smftools/cli/recipes.py +26 -0
smftools/cli/spatial_adata.py +23 -109
smftools/cli/variant_adata.py +423 -0
smftools/cli_entry.py +41 -5
smftools/config/conversion.yaml +0 -10
smftools/config/deaminase.yaml +3 -0
smftools/config/default.yaml +49 -13
smftools/config/experiment_config.py +96 -3
smftools/constants.py +4 -0
smftools/hmm/call_hmm_peaks.py +1 -1
smftools/informatics/binarize_converted_base_identities.py +2 -89
smftools/informatics/converted_BAM_to_adata.py +53 -13
smftools/informatics/h5ad_functions.py +83 -0
smftools/informatics/modkit_extract_to_adata.py +4 -0
smftools/plotting/__init__.py +26 -12
smftools/plotting/autocorrelation_plotting.py +22 -4
smftools/plotting/chimeric_plotting.py +1893 -0
smftools/plotting/classifiers.py +28 -14
smftools/plotting/general_plotting.py +58 -3362
smftools/plotting/hmm_plotting.py +1586 -2
smftools/plotting/latent_plotting.py +804 -0
smftools/plotting/plotting_utils.py +243 -0
smftools/plotting/position_stats.py +16 -8
smftools/plotting/preprocess_plotting.py +281 -0
smftools/plotting/qc_plotting.py +8 -3
smftools/plotting/spatial_plotting.py +1134 -0
smftools/plotting/variant_plotting.py +1231 -0
smftools/preprocessing/__init__.py +3 -0
smftools/preprocessing/append_base_context.py +1 -1
smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
smftools/preprocessing/append_variant_call_layer.py +480 -0
smftools/preprocessing/flag_duplicate_reads.py +4 -4
smftools/preprocessing/invert_adata.py +1 -0
smftools/readwrite.py +109 -85
smftools/tools/__init__.py +6 -0
smftools/tools/calculate_knn.py +121 -0
smftools/tools/calculate_nmf.py +18 -7
smftools/tools/calculate_pca.py +180 -0
smftools/tools/calculate_umap.py +70 -154
smftools/tools/position_stats.py +4 -4
smftools/tools/rolling_nn_distance.py +640 -3
smftools/tools/sequence_alignment.py +140 -0
smftools/tools/tensor_factorization.py +52 -4
{smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
{smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
{smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
{smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
{smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0

smftools/cli/latent_adata.py CHANGED Viewed

@@ -2,16 +2,118 @@ from __future__ import annotations
 import logging
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional, Sequence, Tuple
 import anndata as ad
-from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
+from smftools.constants import LATENT_DIR, LOGGING_DIR, REFERENCE_STRAND, SEQUENCE_INTEGER_ENCODING
 from smftools.logging_utils import get_logger, setup_logging
 logger = get_logger(__name__)
+def _build_mod_sites_var_filter_mask(
+    adata: ad.AnnData,
+    references: Sequence[str],
+    cfg,
+    smf_modality: str,
+    deaminase: bool,
+) -> "np.ndarray":
+    """Build a boolean var mask for mod sites across references."""
+    import numpy as np
+    mod_target_bases = _expand_mod_target_bases(cfg.mod_target_bases)
+    ref_masks = []
+    for ref in references:
+        if deaminase and smf_modality != "direct":
+            mod_site_cols = [f"{ref}_C_site"]
+        else:
+            mod_site_cols = [f"{ref}_{base}_site" for base in mod_target_bases]
+        position_col = f"position_in_{ref}"
+        required_cols = mod_site_cols + [position_col]
+        missing = [col for col in required_cols if col not in adata.var.columns]
+        if missing:
+            raise KeyError(f"var_filters not found in adata.var: {missing}")
+        mod_masks = [np.asarray(adata.var[col].values, dtype=bool) for col in mod_site_cols]
+        mod_mask = mod_masks[0] if len(mod_masks) == 1 else np.logical_or.reduce(mod_masks)
+        position_mask = np.asarray(adata.var[position_col].values, dtype=bool)
+        ref_masks.append(np.logical_and(mod_mask, position_mask))
+    if not ref_masks:
+        return np.ones(adata.n_vars, dtype=bool)
+    return np.logical_and.reduce(ref_masks)
+def _build_shared_valid_non_mod_sites_mask(
+    adata: ad.AnnData,
+    references: Sequence[str],
+    cfg,
+    smf_modality: str,
+    deaminase: bool,
+) -> "np.ndarray":
+    """Build a boolean var mask for shared valid positions without mod sites."""
+    import numpy as np
+    shared_position_mask = _build_reference_position_mask(adata, references)
+    if len(references) == 0:
+        return shared_position_mask
+    mod_target_bases = _expand_mod_target_bases(cfg.mod_target_bases)
+    ref_mod_masks = []
+    for ref in references:
+        if deaminase and smf_modality != "direct":
+            mod_site_cols = [f"{ref}_C_site"]
+        else:
+            mod_site_cols = [f"{ref}_{base}_site" for base in mod_target_bases]
+        required_cols = mod_site_cols
+        missing = [col for col in required_cols if col not in adata.var.columns]
+        if missing:
+            raise KeyError(f"var_filters not found in adata.var: {missing}")
+        mod_masks = [np.asarray(adata.var[col].values, dtype=bool) for col in mod_site_cols]
+        ref_mod_masks.append(
+            mod_masks[0] if len(mod_masks) == 1 else np.logical_or.reduce(mod_masks)
+        )
+    any_mod_mask = (
+        np.logical_or.reduce(ref_mod_masks) if ref_mod_masks else np.zeros(adata.n_vars, dtype=bool)
+    )
+    return np.logical_and(shared_position_mask, np.logical_not(any_mod_mask))
+def _expand_mod_target_bases(mod_target_bases: Sequence[str]) -> list[str]:
+    """Ensure ambiguous GpC/CpG sites are included when requested."""
+    bases = list(mod_target_bases)
+    if any(base in {"GpC", "CpG"} for base in bases) and "ambiguous_GpC_CpG" not in bases:
+        bases.append("ambiguous_GpC_CpG")
+    return bases
+def _build_reference_position_mask(
+    adata: ad.AnnData,
+    references: Sequence[str],
+) -> "np.ndarray":
+    """Build a boolean var mask for positions valid across references."""
+    import numpy as np
+    ref_masks = []
+    for ref in references:
+        position_col = f"position_in_{ref}"
+        if position_col not in adata.var.columns:
+            raise KeyError(f"var_filters not found in adata.var: {position_col}")
+        position_mask = np.asarray(adata.var[position_col].values, dtype=bool)
+        ref_masks.append(position_mask)
+    if not ref_masks:
+        return np.ones(adata.n_vars, dtype=bool)
+    return np.logical_and.reduce(ref_masks)
 def latent_adata(
     config_path: str,
 ) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
@@ -43,6 +145,8 @@ def latent_adata(
     pp_path = paths.pp
     pp_dedup_path = paths.pp_dedup
     spatial_path = paths.spatial
+    chimeric_path = paths.chimeric
+    variant_path = paths.variant
     hmm_path = paths.hmm
     latent_path = paths.latent
@@ -59,15 +163,21 @@ def latent_adata(
         return adata
     # 3) Decide which AnnData to use as the *starting point* for latent analyses
-    if latent_path.exists():
-        start_adata = _load(latent_path)
-        source_path = latent_path
-    elif hmm_path.exists():
+    if hmm_path.exists():
         start_adata = _load(hmm_path)
         source_path = hmm_path
+    elif latent_path.exists():
+        start_adata = _load(latent_path)
+        source_path = latent_path
     elif spatial_path.exists():
         start_adata = _load(spatial_path)
         source_path = spatial_path
+    elif chimeric_path.exists():
+        start_adata = _load(chimeric_path)
+        source_path = chimeric_path
+    elif variant_path.exists():
+        start_adata = _load(variant_path)
+        source_path = variant_path
     elif pp_dedup_path.exists():
         start_adata = _load(pp_dedup_path)
         source_path = pp_dedup_path
@@ -109,7 +219,7 @@ def latent_adata_core(
     Does:
     - Optional sample sheet load.
     - Optional inversion & reindexing.
-    - PCA/UMAP/Leiden.
+    - PCA/KNN/UMAP/Leiden/NMF/PARAFAC.
     - Save latent AnnData to `latent_adata_path`.
     Returns
@@ -130,20 +240,24 @@ def latent_adata_core(
     from ..metadata import record_smftools_metadata
     from ..plotting import (
         plot_cp_sequence_components,
-        plot_embedding,
+        plot_embedding_grid,
         plot_nmf_components,
-        plot_pca,
-        plot_umap,
+        plot_pca_components,
+        plot_pca_explained_variance,
+        plot_pca_grid,
+        plot_umap_grid,
     )
     from ..preprocessing import (
         invert_adata,
         load_sample_sheet,
         reindex_references_adata,
     )
-    from ..readwrite import make_dirs, safe_read_h5ad
+    from ..readwrite import make_dirs
     from ..tools import (
+        calculate_knn,
         calculate_leiden,
         calculate_nmf,
+        calculate_pca,
         calculate_sequence_cp_decomposition,
         calculate_umap,
     )
@@ -214,97 +328,438 @@ def latent_adata_core(
     references = adata.obs[cfg.reference_column].cat.categories
+    latent_dir_dedup = latent_directory / "deduplicated"
     # ============================================================
-    # 2) PCA/UMAP on *deduplicated* preprocessed AnnData
+    # 2) PCA/UMAP/NMF at valid modified base site binary encodings shared across references
     # ============================================================
-    latent_dir_dedup = latent_directory / "deduplicated"
-    umap_dir = latent_dir_dedup / "07_umaps"
-    nmf_dir = latent_dir_dedup / "07b_nmf"
-    nmf_sequence_dir = latent_dir_dedup / "07c_nmf_sequence"
-    var_filters = []
-    if smf_modality == "direct":
-        for ref in references:
-            for base in cfg.mod_target_bases:
-                var_filters.append(f"{ref}_{base}_site")
-    elif deaminase:
-        for ref in references:
-            var_filters.append(f"{ref}_C_site")
+    SUBSET = "shared_valid_mod_sites_binary_mod_arrays"
+    pca_dir = latent_dir_dedup / f"01_pca_{SUBSET}"
+    umap_dir = latent_dir_dedup / f"01_umap_{SUBSET}"
+    nmf_dir = latent_dir_dedup / f"01_nmf_{SUBSET}"
+    mod_site_layers = []
+    for mod_base in cfg.mod_target_bases:
+        mod_site_layers += [f"Modified_{mod_base}_site_count", f"Fraction_{mod_base}_site_modified"]
+    plotting_layers = [cfg.sample_name_col_for_plotting, REFERENCE_STRAND] + mod_site_layers
+    plotting_layers += cfg.umap_layers_to_plot
+    mod_sites_mask = _build_mod_sites_var_filter_mask(
+        adata=adata,
+        references=references,
+        cfg=cfg,
+        smf_modality=smf_modality,
+        deaminase=deaminase,
+    )
+    non_mod_sites_mask = _build_shared_valid_non_mod_sites_mask(
+        adata=adata,
+        references=references,
+        cfg=cfg,
+        smf_modality=smf_modality,
+        deaminase=deaminase,
+    )
+    # PCA calculation
+    adata = calculate_pca(
+        adata,
+        layer=cfg.layer_for_umap_plotting,
+        var_mask=mod_sites_mask,
+        n_pcs=10,
+        output_suffix=SUBSET,
+    )
+    # KNN calculation
+    adata = calculate_knn(
+        adata,
+        obsm=f"X_pca_{SUBSET}",
+        knn_neighbors=15,
+    )
+    # UMAP Calculation
+    adata = calculate_umap(
+        adata,
+        obsm=f"X_pca_{SUBSET}",
+        output_suffix=SUBSET,
+    )
+    # Leiden clustering
+    calculate_leiden(adata, resolution=0.1, connectivities_key=f"connectivities_X_pca_{SUBSET}")
+    # NMF Calculation
+    adata = calculate_nmf(
+        adata,
+        layer=cfg.layer_for_umap_plotting,
+        var_mask=mod_sites_mask,
+        n_components=2,
+        suffix=SUBSET,
+    )
+    # PCA
+    if pca_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+        logger.debug(f"{pca_dir} already exists. Skipping PCA calculation and plotting.")
     else:
-        for ref in references:
-            for base in cfg.mod_target_bases:
-                var_filters.append(f"{ref}_{base}_site")
+        make_dirs([pca_dir])
+        plot_pca_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=pca_dir)
+        plot_pca_explained_variance(adata, subset=SUBSET, output_dir=pca_dir)
+        plot_pca_components(adata, output_dir=pca_dir, suffix=SUBSET)
-    # UMAP / Leiden
+    # UMAP
     if umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
         logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
     else:
         make_dirs([umap_dir])
+        plot_umap_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=umap_dir)
-        adata = calculate_umap(
-            adata,
-            layer=cfg.layer_for_umap_plotting,
-            var_filters=var_filters,
-            n_pcs=10,
-            knn_neighbors=15,
-        )
+    # NMF
+    if nmf_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+        logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
+    else:
+        make_dirs([nmf_dir])
-        calculate_leiden(adata, resolution=0.1)
+        plot_embedding_grid(adata, basis=f"nmf_{SUBSET}", color=plotting_layers, output_dir=nmf_dir)
+        plot_nmf_components(adata, output_dir=nmf_dir, suffix=SUBSET)
-        umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
-        umap_layers += cfg.umap_layers_to_plot
-        plot_umap(adata, color=umap_layers, output_dir=umap_dir)
-        plot_pca(adata, color=umap_layers, output_dir=umap_dir)
+    # ============================================================
+    # 3) PCA/UMAP/NMF at valid base site integer encodings shared across references
+    # ============================================================
+    SUBSET = "shared_valid_ref_sites_integer_sequence_encodings"
+    pca_dir = latent_dir_dedup / f"02_pca_{SUBSET}"
+    umap_dir = latent_dir_dedup / f"02_umap_{SUBSET}"
+    nmf_dir = latent_dir_dedup / f"02_nmf_{SUBSET}"
+    valid_sites = _build_reference_position_mask(adata, references)
+    # PCA calculation
+    adata = calculate_pca(
+        adata,
+        layer=SEQUENCE_INTEGER_ENCODING,
+        var_mask=valid_sites,
+        n_pcs=10,
+        output_suffix=SUBSET,
+    )
+    # KNN calculation
+    adata = calculate_knn(
+        adata,
+        obsm=f"X_pca_{SUBSET}",
+        knn_neighbors=15,
+    )
+    # UMAP Calculation
+    adata = calculate_umap(
+        adata,
+        obsm=f"X_pca_{SUBSET}",
+        output_suffix=SUBSET,
+    )
+    # Leiden clustering
+    calculate_leiden(adata, resolution=0.1, connectivities_key=f"connectivities_X_pca_{SUBSET}")
+    # NMF Calculation
+    adata = calculate_nmf(
+        adata,
+        layer=SEQUENCE_INTEGER_ENCODING,
+        var_mask=valid_sites,
+        n_components=2,
+        suffix=SUBSET,
+    )
+    # PCA
+    if pca_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{pca_dir} already exists. Skipping PCA calculation and plotting.")
+    else:
+        make_dirs([pca_dir])
+        plot_pca_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=pca_dir)
+        plot_pca_explained_variance(adata, subset=SUBSET, output_dir=pca_dir)
+        plot_pca_components(adata, output_dir=pca_dir, suffix=SUBSET)
+    # UMAP
+    if umap_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
+    else:
+        make_dirs([umap_dir])
+        plot_umap_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=umap_dir)
     # NMF
-    if nmf_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+    if nmf_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
         logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
     else:
         make_dirs([nmf_dir])
-        adata = calculate_nmf(
+        plot_embedding_grid(adata, basis=f"nmf_{SUBSET}", color=plotting_layers, output_dir=nmf_dir)
+        plot_nmf_components(adata, output_dir=nmf_dir, suffix=SUBSET)
+    # ============================================================
+    # 3) CP PARAFAC factorization of shared mod site OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "shared_valid_mod_sites_ohe_sequence_N_masked"
+    cp_sequence_dir = latent_dir_dedup / f"03_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        adata = calculate_sequence_cp_decomposition(
+            adata,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            var_mask=mod_sites_mask,
+            var_mask_name="shared_reference_and_mod_site_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=False,
+        )
+    # CP decomposition using sequence integer encoding (no var filters)
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
+        )
+        plot_cp_sequence_components(
+            adata,
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+        )
+    # ============================================================
+    # 4) Non-negative CP PARAFAC factorization of shared mod site OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "shared_valid_mod_sites_ohe_sequence_N_masked_non_negative"
+    cp_sequence_dir = latent_dir_dedup / f"04_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        adata = calculate_sequence_cp_decomposition(
+            adata,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            var_mask=mod_sites_mask,
+            var_mask_name="shared_reference_mod_site_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=True,
+        )
+    # CP decomposition using sequence integer encoding (no var filters)
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
+        )
+        plot_cp_sequence_components(
+            adata,
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+        )
+    # ============================================================
+    # 6) CP PARAFAC factorization of non mod-site OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "non_mod_site_ohe_sequence_N_masked"
+    cp_sequence_dir = latent_dir_dedup / f"05_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        adata = calculate_sequence_cp_decomposition(
+            adata,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            var_mask=non_mod_sites_mask,
+            var_mask_name="non_mod_site_reference_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=False,
+        )
+    # CP decomposition using sequence integer encoding (no var filters)
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
+        )
+        plot_cp_sequence_components(
+            adata,
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+        )
+    # ============================================================
+    # 7) Non-negative CP PARAFAC factorization of non mod-site OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "non_mod_site_ohe_sequence_N_masked_non_negative"
+    cp_sequence_dir = latent_dir_dedup / f"06_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        adata = calculate_sequence_cp_decomposition(
             adata,
-            layer=cfg.layer_for_umap_plotting,
-            var_filters=var_filters,
-            n_components=5,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            var_mask=non_mod_sites_mask,
+            var_mask_name="non_mod_site_reference_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=True,
         )
-        nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
-        nmf_layers += cfg.umap_layers_to_plot
-        plot_embedding(adata, basis="nmf", color=nmf_layers, output_dir=nmf_dir)
-        plot_nmf_components(adata, output_dir=nmf_dir)
     # CP decomposition using sequence integer encoding (no var filters)
-    if nmf_sequence_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
-        logger.debug(f"{nmf_sequence_dir} already exists. Skipping sequence CP plotting.")
-    elif SEQUENCE_INTEGER_ENCODING not in adata.layers:
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
+        )
+        plot_cp_sequence_components(
+            adata,
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+        )
+    # ============================================================
+    # 8) CP PARAFAC factorization of full OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "full_ohe_sequence_N_masked"
+    cp_sequence_dir = latent_dir_dedup / f"07_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
         logger.warning(
             "Layer %s not found; skipping sequence integer encoding CP.",
             SEQUENCE_INTEGER_ENCODING,
         )
     else:
-        make_dirs([nmf_sequence_dir])
         adata = calculate_sequence_cp_decomposition(
             adata,
             layer=SEQUENCE_INTEGER_ENCODING,
-            rank=5,
-            embedding_key="X_cp_sequence",
-            components_key="H_cp_sequence",
-            uns_key="cp_sequence",
+            var_mask=_build_reference_position_mask(adata, references),
+            var_mask_name="shared_reference_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=False,
+        )
+    # CP decomposition using sequence integer encoding (no var filters)
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
+        )
+        plot_cp_sequence_components(
+            adata,
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+        )
+    # ============================================================
+    # 9) Non-negative CP PARAFAC factorization of full OHE sequences with mask layer
+    # ============================================================
+    SUBSET = "full_ohe_sequence_N_masked_non_negative"
+    cp_sequence_dir = latent_dir_dedup / f"08_cp_{SUBSET}"
+    # Calculate CP tensor factorization
+    if SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        adata = calculate_sequence_cp_decomposition(
+            adata,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            var_mask=_build_reference_position_mask(adata, references),
+            var_mask_name="shared_reference_positions",
+            rank=2,
+            embedding_key=f"X_cp_{SUBSET}",
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
+            non_negative=True,
+        )
+    # CP decomposition using sequence integer encoding (no var filters)
+    if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
+        logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
+    else:
+        make_dirs([cp_sequence_dir])
+        plot_embedding_grid(
+            adata,
+            basis=f"cp_{SUBSET}",
+            color=plotting_layers,
+            output_dir=cp_sequence_dir,
         )
-        nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
-        nmf_layers += cfg.umap_layers_to_plot
-        plot_embedding(adata, basis="cp_sequence", color=nmf_layers, output_dir=nmf_sequence_dir)
         plot_cp_sequence_components(
             adata,
-            output_dir=nmf_sequence_dir,
-            components_key="H_cp_sequence",
-            uns_key="cp_sequence",
+            output_dir=cp_sequence_dir,
+            components_key=f"H_cp_{SUBSET}",
+            uns_key=f"cp_{SUBSET}",
         )
     # ============================================================
-    # 5) Save latent AnnData
+    # 10) Save latent AnnData
     # ============================================================
-    if (not latent_adata_path.exists()) or getattr(cfg, "force_redo_latent_analyses", False):
-        logger.info("Saving latent analyzed AnnData (post preprocessing and duplicate removal).")
+    if not latent_adata_path.exists():
+        logger.info("Saving latent analyzed AnnData")
         record_smftools_metadata(
             adata,
             step_name="latent",

smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl