smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/tools/calculate_umap.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
5
|
from smftools.logging_utils import get_logger
|
|
6
6
|
from smftools.optional_imports import require
|
|
@@ -13,91 +13,90 @@ logger = get_logger(__name__)
|
|
|
13
13
|
|
|
14
14
|
def calculate_umap(
|
|
15
15
|
adata: "ad.AnnData",
|
|
16
|
-
|
|
17
|
-
var_filters: Sequence[str] | None = None,
|
|
18
|
-
n_pcs: int = 15,
|
|
19
|
-
knn_neighbors: int = 100,
|
|
16
|
+
obsm: str | None = "X_pca",
|
|
20
17
|
overwrite: bool = True,
|
|
21
18
|
threads: int = 8,
|
|
19
|
+
random_state: int | None = 0,
|
|
20
|
+
output_suffix: str | None = None,
|
|
22
21
|
) -> "ad.AnnData":
|
|
23
|
-
"""Compute
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
adata: AnnData object to update.
|
|
27
|
-
layer: Layer name to use for PCA/UMAP (``None`` uses ``adata.X``).
|
|
28
|
-
var_filters: Optional list of var masks to subset features.
|
|
29
|
-
n_pcs: Number of principal components.
|
|
30
|
-
knn_neighbors: Number of neighbors for the graph.
|
|
31
|
-
overwrite: Whether to recompute embeddings if they exist.
|
|
32
|
-
threads: Number of OMP threads for computation.
|
|
33
|
-
|
|
34
|
-
Returns:
|
|
35
|
-
anndata.AnnData: Updated AnnData object.
|
|
36
|
-
"""
|
|
37
|
-
import os
|
|
22
|
+
"""Compute UMAP embedding from an `.obsm` embedding, and store connectivities."""
|
|
38
23
|
|
|
39
24
|
import numpy as np
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
25
|
+
import scipy.sparse as sp
|
|
26
|
+
|
|
27
|
+
if obsm is None:
|
|
28
|
+
raise ValueError("obsm must be a key in adata.obsm (e.g., 'X_pca').")
|
|
29
|
+
|
|
30
|
+
if obsm not in adata.obsm:
|
|
31
|
+
raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
|
|
32
|
+
|
|
33
|
+
umap = require("umap", extra="umap", purpose="UMAP calculation")
|
|
34
|
+
|
|
35
|
+
output_obsm = f"X_umap_{output_suffix}" if output_suffix else "X_umap"
|
|
36
|
+
conn_key = f"connectivities_{obsm}"
|
|
37
|
+
|
|
38
|
+
# Decide n_neighbors: prefer stored KNN params, else UMAP default-ish
|
|
39
|
+
n_neighbors = None
|
|
40
|
+
knn_uns_key = f"knn_distances_{obsm}"
|
|
41
|
+
if knn_uns_key in adata.uns:
|
|
42
|
+
params = adata.uns[knn_uns_key].get("params", {})
|
|
43
|
+
n_neighbors = params.get("n_neighbors_used", params.get("n_neighbors", None))
|
|
44
|
+
if n_neighbors is None:
|
|
45
|
+
n_neighbors = 15 # reasonable default if KNN wasn't precomputed
|
|
46
|
+
logger.warning(
|
|
47
|
+
"No %r found in adata.uns; defaulting n_neighbors=%d for UMAP.",
|
|
48
|
+
knn_uns_key,
|
|
49
|
+
n_neighbors,
|
|
54
50
|
)
|
|
51
|
+
|
|
52
|
+
# Build input matrix X and handle NaNs locally
|
|
53
|
+
X = adata.obsm[obsm]
|
|
54
|
+
if sp.issparse(X):
|
|
55
|
+
# UMAP can accept sparse CSR; keep it sparse
|
|
56
|
+
pass
|
|
55
57
|
else:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
adata.obsm["X_umap"] = adata_subset.obsm["X_umap"]
|
|
87
|
-
adata.obsp["distances"] = adata_subset.obsp["distances"]
|
|
88
|
-
adata.obsp["connectivities"] = adata_subset.obsp["connectivities"]
|
|
89
|
-
adata.uns["neighbors"] = adata_subset.uns["neighbors"]
|
|
90
|
-
|
|
91
|
-
# Fix varm["PCs"] shape mismatch
|
|
92
|
-
pc_matrix = np.zeros((adata.shape[1], adata_subset.varm["PCs"].shape[1]))
|
|
93
|
-
if var_filters:
|
|
94
|
-
subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
|
|
95
|
-
pc_matrix[subset_mask, :] = adata_subset.varm["PCs"]
|
|
58
|
+
X = np.asarray(X)
|
|
59
|
+
if np.isnan(X).any():
|
|
60
|
+
logger.warning("NaNs detected in %s; filling NaNs with 0.5 for UMAP.", obsm)
|
|
61
|
+
X = np.nan_to_num(X, nan=0.5)
|
|
62
|
+
|
|
63
|
+
if (not overwrite) and (output_obsm in adata.obsm) and (conn_key in adata.obsp):
|
|
64
|
+
logger.info("UMAP + connectivities already exist and overwrite=False; skipping.")
|
|
65
|
+
return adata
|
|
66
|
+
|
|
67
|
+
logger.info("Running UMAP (obsm=%s, n_neighbors=%d, metric=euclidean)", obsm, n_neighbors)
|
|
68
|
+
|
|
69
|
+
# Note: umap-learn uses numba threading; n_jobs controls parallelism in UMAP
|
|
70
|
+
# and is ignored when random_state is set (umap-learn behavior).
|
|
71
|
+
umap_model = umap.UMAP(
|
|
72
|
+
n_neighbors=int(n_neighbors),
|
|
73
|
+
n_components=2,
|
|
74
|
+
metric="euclidean",
|
|
75
|
+
random_state=random_state,
|
|
76
|
+
n_jobs=int(threads),
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
embedding = umap_model.fit_transform(X)
|
|
80
|
+
adata.obsm[output_obsm] = embedding
|
|
81
|
+
|
|
82
|
+
# UMAP's computed fuzzy graph
|
|
83
|
+
connectivities = getattr(umap_model, "graph_", None)
|
|
84
|
+
if connectivities is not None:
|
|
85
|
+
adata.obsp[conn_key] = (
|
|
86
|
+
connectivities.tocsr() if sp.issparse(connectivities) else connectivities
|
|
87
|
+
)
|
|
96
88
|
else:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
adata.
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
89
|
+
logger.warning("UMAP model did not expose graph_; connectivities not stored.")
|
|
90
|
+
|
|
91
|
+
adata.uns[output_obsm] = {
|
|
92
|
+
"params": {
|
|
93
|
+
"obsm": obsm,
|
|
94
|
+
"n_neighbors": int(n_neighbors),
|
|
95
|
+
"metric": "euclidean",
|
|
96
|
+
"random_state": random_state,
|
|
97
|
+
"n_jobs": int(threads),
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
logger.info("Stored: adata.obsm[%s]=%s", output_obsm, embedding.shape)
|
|
103
102
|
return adata
|
smftools/tools/position_stats.py
CHANGED
|
@@ -320,12 +320,12 @@ def compute_positionwise_statistics(
|
|
|
320
320
|
|
|
321
321
|
# samples / refs
|
|
322
322
|
sseries = adata.obs[sample_col]
|
|
323
|
-
if not pd.
|
|
323
|
+
if not isinstance(sseries.dtype, pd.CategoricalDtype):
|
|
324
324
|
sseries = sseries.astype("category")
|
|
325
325
|
samples = list(sseries.cat.categories)
|
|
326
326
|
|
|
327
327
|
rseries = adata.obs[ref_col]
|
|
328
|
-
if not pd.
|
|
328
|
+
if not isinstance(rseries.dtype, pd.CategoricalDtype):
|
|
329
329
|
rseries = rseries.astype("category")
|
|
330
330
|
references = list(rseries.cat.categories)
|
|
331
331
|
|
|
@@ -509,12 +509,12 @@ def plot_positionwise_matrices(
|
|
|
509
509
|
|
|
510
510
|
# canonicalize sample/ref order
|
|
511
511
|
sseries = adata.obs[sample_col]
|
|
512
|
-
if not pd.
|
|
512
|
+
if not isinstance(sseries.dtype, pd.CategoricalDtype):
|
|
513
513
|
sseries = sseries.astype("category")
|
|
514
514
|
samples = list(sseries.cat.categories)
|
|
515
515
|
|
|
516
516
|
rseries = adata.obs[ref_col]
|
|
517
|
-
if not pd.
|
|
517
|
+
if not isinstance(rseries.dtype, pd.CategoricalDtype):
|
|
518
518
|
rseries = rseries.astype("category")
|
|
519
519
|
references = list(rseries.cat.categories)
|
|
520
520
|
|