smftools-0.3.1-py3-none-any.whl → smftools-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +18 -2
- smftools/cli/hmm_adata.py +18 -1
- smftools/cli/latent_adata.py +522 -67
- smftools/cli/load_adata.py +2 -2
- smftools/cli/preprocess_adata.py +32 -93
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +23 -109
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +41 -5
- smftools/config/conversion.yaml +0 -10
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +49 -13
- smftools/config/experiment_config.py +96 -3
- smftools/constants.py +4 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +53 -13
- smftools/informatics/h5ad_functions.py +83 -0
- smftools/informatics/modkit_extract_to_adata.py +4 -0
- smftools/plotting/__init__.py +26 -12
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +58 -3362
- smftools/plotting/hmm_plotting.py +1586 -2
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +3 -0
- smftools/preprocessing/append_base_context.py +1 -1
- smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +109 -85
- smftools/tools/__init__.py +6 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_nmf.py +18 -7
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +70 -154
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +640 -3
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +52 -4
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/tools/calculate_knn.py
ADDED
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def calculate_knn(
+    adata: "ad.AnnData",
+    obsm: str = "X_pca",
+    knn_neighbors: int = 100,
+    overwrite: bool = True,
+    threads: int = 8,
+    random_state: int | None = 0,
+    symmetrize: bool = True,
+) -> "ad.AnnData":
+    """Compute a KNN distance graph on an embedding in `adata.obsm[obsm]`.
+
+    Stores:
+    - adata.obsp[f"knn_distances_{obsm}"] : CSR sparse matrix of distances
+    - adata.uns[f"knn_distances_{obsm}"]["params"] : metadata
+
+    Args:
+        adata: AnnData object to update.
+        obsm: Key in `adata.obsm` to use as the embedding.
+        knn_neighbors: Target number of neighbors (will be clipped to n_obs-1).
+        overwrite: If False and graph exists, do nothing.
+        threads: Parallel jobs for pynndescent.
+        random_state: Seed for pynndescent.
+        symmetrize: If True, make distance graph symmetric via min(A, A.T).
+
+    Returns:
+        Updated AnnData.
+    """
+    import numpy as np
+    import scipy.sparse as sp
+
+    if obsm not in adata.obsm:
+        raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
+
+    out_key = f"knn_distances_{obsm}"
+    if not overwrite and out_key in adata.obsp:
+        logger.info("KNN graph %r already exists and overwrite=False; skipping.", out_key)
+        return adata
+
+    data = adata.obsm[obsm]
+
+    if sp.issparse(data):
+        # Convert to float32 for pynndescent/numba friendliness if needed
+        data = data.astype(np.float32)
+        logger.info(
+            "Sparse embedding detected (%s). Proceeding without NaN check.", type(data).__name__
+        )
+    else:
+        data = np.asarray(data)
+        if np.isnan(data).any():
+            logger.warning("NaNs detected in %s; filling NaNs with 0.5 before KNN.", obsm)
+            data = np.nan_to_num(data, nan=0.5)
+        data = data.astype(np.float32, copy=False)
+
+    pynndescent = require("pynndescent", extra="umap", purpose="KNN graph computation")
+
+    n_obs = data.shape[0]
+    if n_obs < 2:
+        raise ValueError(f"Need at least 2 observations for KNN; got n_obs={n_obs}")
+
+    n_neighbors = min(int(knn_neighbors), n_obs - 1)
+    if n_neighbors < 1:
+        raise ValueError(f"Computed n_neighbors={n_neighbors}; check knn_neighbors and n_obs.")
+
+    logger.info(
+        "Running pynndescent KNN (obsm=%s, n_neighbors=%d, metric=euclidean, n_jobs=%d)",
+        obsm,
+        n_neighbors,
+        threads,
+    )
+
+    nn_index = pynndescent.NNDescent(
+        data,
+        n_neighbors=n_neighbors,
+        metric="euclidean",
+        random_state=random_state,
+        n_jobs=threads,
+    )
+    knn_indices, knn_dists = nn_index.neighbor_graph  # shapes: (n_obs, n_neighbors)
+
+    rows = np.repeat(np.arange(n_obs, dtype=np.int64), n_neighbors)
+    cols = knn_indices.reshape(-1).astype(np.int64, copy=False)
+    vals = knn_dists.reshape(-1).astype(np.float32, copy=False)
+
+    distances = sp.coo_matrix((vals, (rows, cols)), shape=(n_obs, n_obs)).tocsr()
+
+    # Optional: ensure diagonal is 0 and (optionally) symmetrize
+    distances.setdiag(0.0)
+    distances.eliminate_zeros()
+
+    if symmetrize:
+        # Keep the smaller directed distance for each undirected edge
+        distances = distances.minimum(distances.T)
+
+    adata.obsp[out_key] = distances
+    adata.uns[out_key] = {
+        "params": {
+            "obsm": obsm,
+            "n_neighbors_requested": int(knn_neighbors),
+            "n_neighbors_used": int(n_neighbors),
+            "method": "pynndescent",
+            "metric": "euclidean",
+            "random_state": random_state,
+            "n_jobs": int(threads),
+            "symmetrize": bool(symmetrize),
+        }
+    }
+
+    return adata
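For orientation, a minimal usage sketch of the new calculate_knn helper defined above. The toy AnnData and the precomputed "X_pca" embedding are illustrative only; the import path assumes the module layout shown in this diff, and pynndescent must be installed.

    import anndata as ad
    import numpy as np

    from smftools.tools.calculate_knn import calculate_knn

    # Toy AnnData with a placeholder PCA embedding; in the real pipeline this
    # would come from calculate_pca.
    adata = ad.AnnData(X=np.random.rand(50, 20).astype(np.float32))
    adata.obsm["X_pca"] = np.random.rand(50, 15).astype(np.float32)

    adata = calculate_knn(adata, obsm="X_pca", knn_neighbors=10, threads=2)

    # Outputs are keyed by the embedding name.
    print(adata.obsp["knn_distances_X_pca"].shape)  # (50, 50) sparse CSR of distances
    print(adata.uns["knn_distances_X_pca"]["params"]["n_neighbors_used"])  # 10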
smftools/tools/calculate_nmf.py
CHANGED
@@ -9,6 +9,7 @@ from smftools.optional_imports import require
 
 if TYPE_CHECKING:
     import anndata as ad
+    import numpy as np
 
 logger = get_logger(__name__)
 
@@ -16,7 +17,7 @@ logger = get_logger(__name__)
 def calculate_nmf(
     adata: "ad.AnnData",
     layer: str | None = "nan_half",
-
+    var_mask: "np.ndarray | Sequence[bool] | None" = None,
     n_components: int = 2,
     max_iter: int = 200,
     random_state: int = 0,
@@ -24,13 +25,14 @@ def calculate_nmf(
     embedding_key: str = "X_nmf",
     components_key: str = "H_nmf",
     uns_key: str = "nmf",
+    suffix: str | None = None,
 ) -> "ad.AnnData":
     """Compute a low-dimensional NMF embedding.
 
     Args:
         adata: AnnData object to update.
         layer: Layer name to use for NMF (``None`` uses ``adata.X``).
-
+        var_mask: Optional boolean mask to subset features.
         n_components: Number of NMF components to compute.
         max_iter: Maximum number of NMF iterations.
         random_state: Random seed for the NMF initializer.
@@ -47,6 +49,11 @@
     require("sklearn", extra="ml-base", purpose="NMF calculation")
     from sklearn.decomposition import NMF
 
+    if suffix:
+        embedding_key = f"{embedding_key}_{suffix}"
+        components_key = f"{components_key}_{suffix}"
+        uns_key = f"{uns_key}_{suffix}"
+
     has_embedding = embedding_key in adata.obsm
     has_components = components_key in adata.varm
     if has_embedding and has_components and not overwrite:
@@ -56,17 +63,21 @@
         logger.info("NMF embedding present without components; recomputing to store components.")
 
     subset_mask = None
-    if
-        subset_mask = np.
+    if var_mask is not None:
+        subset_mask = np.asarray(var_mask, dtype=bool)
+        if subset_mask.ndim != 1 or subset_mask.shape[0] != adata.n_vars:
+            raise ValueError(
+                "var_mask must be a 1D boolean array with length matching adata.n_vars."
+            )
         adata_subset = adata[:, subset_mask].copy()
         logger.info(
             "Subsetting adata: retained %s features based on filters %s",
             adata_subset.shape[1],
-
+            "var_mask",
         )
     else:
         adata_subset = adata.copy()
-        logger.info("No
+        logger.info("No var_mask provided. Using all features.")
 
     data = adata_subset.layers[layer] if layer else adata_subset.X
     if issparse(data):
@@ -107,7 +118,7 @@
         "max_iter": max_iter,
         "random_state": random_state,
         "layer": layer,
-        "
+        "var_mask_provided": var_mask is not None,
         "components_key": components_key,
     }
 
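A hedged sketch of the new calculate_nmf parameters: var_mask subsets features with a boolean array over vars, and suffix namespaces the output keys. The "GpC_site" column and the toy matrix below are illustrative only, and the expected shapes are inferred from the signature, not documented guarantees.

    import anndata as ad
    import numpy as np
    import pandas as pd

    from smftools.tools.calculate_nmf import calculate_nmf

    # Toy non-negative data; "GpC_site" is a hypothetical var annotation.
    X = np.random.rand(40, 30).astype(np.float32)
    adata = ad.AnnData(X=X, var=pd.DataFrame({"GpC_site": np.arange(30) % 2 == 0}))
    adata.layers["nan_half"] = X.copy()

    adata = calculate_nmf(
        adata,
        layer="nan_half",
        var_mask=adata.var["GpC_site"].to_numpy(dtype=bool),  # 1D boolean, length == adata.n_vars
        n_components=5,
        suffix="GpC",  # keys become X_nmf_GpC / H_nmf_GpC / uns["nmf_GpC"]
    )
    print(adata.obsm["X_nmf_GpC"].shape)  # expected: (40, 5)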
smftools/tools/calculate_pca.py
ADDED
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Sequence
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+    import numpy as np
+
+logger = get_logger(__name__)
+
+
+def calculate_pca(
+    adata: "ad.AnnData",
+    layer: str | None = "nan_half",
+    var_mask: "np.ndarray | Sequence[bool] | None" = None,
+    n_pcs: int = 15,
+    overwrite: bool = True,
+    output_suffix: str | None = None,
+    fill_nan: float | None = 0.5,
+) -> "ad.AnnData":
+    """Compute PCA and store scores in `.obsm` and loadings in `.varm`."""
+
+    import numpy as np
+    import scipy.sparse as sp
+
+    obsm_output = f"X_pca_{output_suffix}" if output_suffix else "X_pca"
+    varm_output = f"PCs_{output_suffix}" if output_suffix else "PCs"
+
+    if not overwrite and obsm_output in adata.obsm and varm_output in adata.varm:
+        logger.info(
+            "PCA outputs already exist and overwrite=False; skipping (%s, %s).",
+            obsm_output,
+            varm_output,
+        )
+        return adata
+
+    # --- Build feature subset mask (over vars) ---
+    if var_mask is not None:
+        subset_mask = np.asarray(var_mask, dtype=bool)
+        if subset_mask.ndim != 1 or subset_mask.shape[0] != adata.n_vars:
+            raise ValueError(
+                "var_mask must be a 1D boolean array with length matching adata.n_vars."
+            )
+        n_vars_used = int(subset_mask.sum())
+        if n_vars_used == 0:
+            raise ValueError("var_mask retained 0 features.")
+        logger.info(
+            "Subsetting vars: retained %d / %d features from var_mask",
+            n_vars_used,
+            adata.n_vars,
+        )
+    else:
+        subset_mask = slice(None)
+        n_vars_used = adata.n_vars
+        logger.info("No var_mask provided; using all %d features.", adata.n_vars)
+
+    # --- Pull matrix view ---
+    if layer is None:
+        matrix = adata.X
+        layer_used = None
+    else:
+        if layer not in adata.layers:
+            raise KeyError(
+                f"Layer {layer!r} not found in adata.layers. Available: {list(adata.layers.keys())}"
+            )
+        matrix = adata.layers[layer]
+        layer_used = layer
+
+    matrix = matrix[:, subset_mask]  # slice view (sparse OK)
+
+    n_obs = matrix.shape[0]
+    if n_obs < 2:
+        raise ValueError(f"PCA requires at least 2 observations; got n_obs={n_obs}")
+    if n_vars_used < 1:
+        raise ValueError("PCA requires at least 1 feature.")
+
+    n_pcs_requested = int(n_pcs)
+    n_pcs_used = min(n_pcs_requested, n_obs, n_vars_used)
+    if n_pcs_used < 1:
+        raise ValueError(f"n_pcs_used became {n_pcs_used}; check inputs.")
+
+    # --- NaN handling (dense only; sparse usually won’t store NaNs) ---
+    if not sp.issparse(matrix):
+        X = np.asarray(matrix, dtype=np.float32)
+        if fill_nan is not None and np.isnan(X).any():
+            logger.warning("NaNs detected; filling NaNs with %s before PCA.", fill_nan)
+            X = np.nan_to_num(X, nan=float(fill_nan))
+    else:
+        X = matrix  # keep sparse
+
+    # --- PCA ---
+    # Prefer sklearn's randomized PCA for speed on big matrices.
+    used_sklearn = False
+    try:
+        sklearn = require("sklearn", extra="ml", purpose="PCA computation")
+        from sklearn.decomposition import PCA, TruncatedSVD
+
+        if sp.issparse(X):
+            # TruncatedSVD works on sparse without centering; good approximation.
+            # If you *need* centered PCA on sparse, you'd need different machinery.
+            logger.info("Running TruncatedSVD (sparse) with n_components=%d", n_pcs_used)
+            model = TruncatedSVD(n_components=n_pcs_used, random_state=0)
+            scores = model.fit_transform(X)  # (n_obs, n_pcs)
+            loadings = model.components_.T  # (n_vars_used, n_pcs)
+            mean = None
+            explained_variance_ratio = getattr(model, "explained_variance_ratio_", None)
+        else:
+            logger.info(
+                "Running sklearn PCA with n_components=%d (svd_solver=randomized)", n_pcs_used
+            )
+            model = PCA(n_components=n_pcs_used, svd_solver="randomized", random_state=0)
+            scores = model.fit_transform(X)  # (n_obs, n_pcs)
+            loadings = model.components_.T  # (n_vars_used, n_pcs)
+            mean = model.mean_
+            explained_variance_ratio = model.explained_variance_ratio_
+
+        used_sklearn = True
+
+    except Exception as e:
+        # Fallback to your manual SVD (dense only)
+        if sp.issparse(X):
+            raise RuntimeError(
+                "Sparse input PCA fallback is not implemented without sklearn. "
+                "Install scikit-learn (extra 'ml') or densify upstream."
+            ) from e
+
+        import scipy.linalg as spla
+
+        logger.warning(
+            "sklearn PCA unavailable; falling back to full SVD (can be slow). Reason: %s", e
+        )
+        Xd = np.asarray(X, dtype=np.float64)
+        mean = Xd.mean(axis=0)
+        centered = Xd - mean
+        u, s, vt = spla.svd(centered, full_matrices=False)
+        u = u[:, :n_pcs_used]
+        s = s[:n_pcs_used]
+        vt = vt[:n_pcs_used]
+        scores = u * s
+        loadings = vt.T
+        explained_variance_ratio = None
+
+    # --- Store scores (obsm) ---
+    adata.obsm[obsm_output] = scores
+
+    # --- Store loadings (varm) with original var dimension ---
+    pc_matrix = np.zeros((adata.n_vars, n_pcs_used), dtype=np.float32)
+    if isinstance(subset_mask, slice):
+        pc_matrix[:, :] = loadings
+    else:
+        pc_matrix[subset_mask, :] = loadings.astype(np.float32, copy=False)
+
+    adata.varm[varm_output] = pc_matrix
+
+    # --- Metadata ---
+    adata.uns[obsm_output] = {
+        "params": {
+            "layer": layer_used,
+            "var_mask_provided": var_mask is not None,
+            "n_pcs_requested": n_pcs_requested,
+            "n_pcs_used": int(n_pcs_used),
+            "used_sklearn": used_sklearn,
+            "fill_nan": fill_nan,
+            "note_sparse": bool(sp.issparse(matrix)),
+        },
+        "explained_variance_ratio": explained_variance_ratio,
+        "mean": mean.tolist() if (mean is not None and isinstance(mean, np.ndarray)) else None,
+    }
+
+    logger.info(
+        "Stored PCA: adata.obsm[%s] (%s) and adata.varm[%s] (%s)",
+        obsm_output,
+        scores.shape,
+        varm_output,
+        pc_matrix.shape,
+    )
+    return adata
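A minimal sketch of calling the new calculate_pca helper, assuming scikit-learn is available; the toy data and layer contents are illustrative, not taken from the package docs.

    import anndata as ad
    import numpy as np

    from smftools.tools.calculate_pca import calculate_pca

    adata = ad.AnnData(X=np.random.rand(60, 40).astype(np.float32))
    adata.layers["nan_half"] = adata.X.copy()

    adata = calculate_pca(adata, layer="nan_half", n_pcs=10, fill_nan=0.5)

    print(adata.obsm["X_pca"].shape)                   # (60, 10) scores
    print(adata.varm["PCs"].shape)                     # (40, 10) loadings, zero-filled outside any var_mask
    print(adata.uns["X_pca"]["params"]["n_pcs_used"])  # 10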
smftools/tools/calculate_umap.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from smftools.logging_utils import get_logger
 from smftools.optional_imports import require
@@ -13,174 +13,90 @@ logger = get_logger(__name__)
 
 def calculate_umap(
     adata: "ad.AnnData",
-
-    var_filters: Sequence[str] | None = None,
-    n_pcs: int = 15,
-    knn_neighbors: int = 100,
+    obsm: str | None = "X_pca",
     overwrite: bool = True,
     threads: int = 8,
     random_state: int | None = 0,
+    output_suffix: str | None = None,
 ) -> "ad.AnnData":
-    """Compute
-
-    Args:
-        adata: AnnData object to update.
-        layer: Layer name to use for PCA/UMAP (``None`` uses ``adata.X``).
-        var_filters: Optional list of var masks to subset features.
-        n_pcs: Number of principal components.
-        knn_neighbors: Number of neighbors for the graph.
-        overwrite: Whether to recompute embeddings if they exist.
-        threads: Number of OMP threads for computation.
-
-    Returns:
-        anndata.AnnData: Updated AnnData object.
-    """
-    import os
+    """Compute UMAP embedding from an `.obsm` embedding, and store connectivities."""
 
     import numpy as np
-    import scipy.linalg as spla
     import scipy.sparse as sp
 
+    if obsm is None:
+        raise ValueError("obsm must be a key in adata.obsm (e.g., 'X_pca').")
+
+    if obsm not in adata.obsm:
+        raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
+
     umap = require("umap", extra="umap", purpose="UMAP calculation")
-
-
-
-
-    #
-
-
-
-
-
-
-
+
+    output_obsm = f"X_umap_{output_suffix}" if output_suffix else "X_umap"
+    conn_key = f"connectivities_{obsm}"
+
+    # Decide n_neighbors: prefer stored KNN params, else UMAP default-ish
+    n_neighbors = None
+    knn_uns_key = f"knn_distances_{obsm}"
+    if knn_uns_key in adata.uns:
+        params = adata.uns[knn_uns_key].get("params", {})
+        n_neighbors = params.get("n_neighbors_used", params.get("n_neighbors", None))
+    if n_neighbors is None:
+        n_neighbors = 15  # reasonable default if KNN wasn't precomputed
+        logger.warning(
+            "No %r found in adata.uns; defaulting n_neighbors=%d for UMAP.",
+            knn_uns_key,
+            n_neighbors,
         )
+
+    # Build input matrix X and handle NaNs locally
+    X = adata.obsm[obsm]
+    if sp.issparse(X):
+        # UMAP can accept sparse CSR; keep it sparse
+        pass
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        logger.warning("Converting sparse matrix to dense for PCA.")
-        matrix = matrix.toarray()
-
-    matrix = np.asarray(matrix, dtype=float)
-    mean = matrix.mean(axis=0)
-    centered = matrix - mean
-
-    if centered.shape[0] == 0 or centered.shape[1] == 0:
-        raise ValueError("PCA requires a non-empty matrix.")
-
-    if n_pcs <= 0:
-        raise ValueError("n_pcs must be positive.")
-
-    if centered.shape[1] <= n_pcs:
-        n_pcs = centered.shape[1]
-
-    if centered.shape[0] < n_pcs:
-        n_pcs = centered.shape[0]
-
-    u, s, vt = spla.svd(centered, full_matrices=False)
-
-    u = u[:, :n_pcs]
-    s = s[:n_pcs]
-    vt = vt[:n_pcs]
-
-    adata_subset.obsm["X_pca"] = u * s
-    adata_subset.varm["PCs"] = vt.T
-
-    logger.info("Running neighborhood graph with pynndescent (n_neighbors=%s)", knn_neighbors)
-    n_neighbors = min(knn_neighbors, max(1, adata_subset.n_obs - 1))
-    nn_index = pynndescent.NNDescent(
-        adata_subset.obsm["X_pca"],
-        n_neighbors=n_neighbors,
-        metric="euclidean",
-        random_state=random_state,
-        n_jobs=threads,
-    )
-    knn_indices, knn_dists = nn_index.neighbor_graph
-
-    rows = np.repeat(np.arange(adata_subset.n_obs), n_neighbors)
-    cols = knn_indices.reshape(-1)
-    distances = sp.coo_matrix(
-        (knn_dists.reshape(-1), (rows, cols)),
-        shape=(adata_subset.n_obs, adata_subset.n_obs),
-    ).tocsr()
-    adata_subset.obsp["distances"] = distances
-
-    logger.info("Running UMAP")
-    umap_model = umap.UMAP(
-        n_neighbors=n_neighbors,
-        n_components=2,
-        metric="euclidean",
-        random_state=random_state,
+        X = np.asarray(X)
+        if np.isnan(X).any():
+            logger.warning("NaNs detected in %s; filling NaNs with 0.5 for UMAP.", obsm)
+            X = np.nan_to_num(X, nan=0.5)
+
+    if (not overwrite) and (output_obsm in adata.obsm) and (conn_key in adata.obsp):
+        logger.info("UMAP + connectivities already exist and overwrite=False; skipping.")
+        return adata
+
+    logger.info("Running UMAP (obsm=%s, n_neighbors=%d, metric=euclidean)", obsm, n_neighbors)
+
+    # Note: umap-learn uses numba threading; n_jobs controls parallelism in UMAP
+    # and is ignored when random_state is set (umap-learn behavior).
+    umap_model = umap.UMAP(
+        n_neighbors=int(n_neighbors),
+        n_components=2,
+        metric="euclidean",
+        random_state=random_state,
+        n_jobs=int(threads),
+    )
+
+    embedding = umap_model.fit_transform(X)
+    adata.obsm[output_obsm] = embedding
+
+    # UMAP's computed fuzzy graph
+    connectivities = getattr(umap_model, "graph_", None)
+    if connectivities is not None:
+        adata.obsp[conn_key] = (
+            connectivities.tocsr() if sp.issparse(connectivities) else connectivities
         )
-
-
-
-
-
-        fuzzy_result = fuzzy_simplicial_set(
-            adata_subset.obsm["X_pca"],
-            n_neighbors=n_neighbors,
-            random_state=random_state,
-            metric="euclidean",
-            knn_indices=knn_indices,
-            knn_dists=knn_dists,
-        )
-        connectivities = fuzzy_result[0] if isinstance(fuzzy_result, tuple) else fuzzy_result
-    except TypeError:
-        connectivities = umap_model.graph_
-
-    adata_subset.obsp["connectivities"] = connectivities
-
-    # Step 4: Store results in original adata
-    adata.obsm["X_pca"] = adata_subset.obsm["X_pca"]
-    adata.obsm["X_umap"] = adata_subset.obsm["X_umap"]
-    adata.obsp["distances"] = adata_subset.obsp["distances"]
-    adata.obsp["connectivities"] = adata_subset.obsp["connectivities"]
-    adata.uns["neighbors"] = {
+    else:
+        logger.warning("UMAP model did not expose graph_; connectivities not stored.")
+
+    adata.uns[output_obsm] = {
         "params": {
-            "
-            "
+            "obsm": obsm,
+            "n_neighbors": int(n_neighbors),
             "metric": "euclidean",
+            "random_state": random_state,
+            "n_jobs": int(threads),
         }
     }
 
-
-    pc_matrix = np.zeros((adata.shape[1], adata_subset.varm["PCs"].shape[1]))
-    if var_filters:
-        subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
-        pc_matrix[subset_mask, :] = adata_subset.varm["PCs"]
-    else:
-        pc_matrix = adata_subset.varm["PCs"]  # No subsetting case
-
-    adata.varm["PCs"] = pc_matrix
-
-    logger.info("Stored: adata.obsm['X_pca'] and adata.obsm['X_umap']")
-
+    logger.info("Stored: adata.obsm[%s]=%s", output_obsm, embedding.shape)
     return adata
smftools/tools/position_stats.py
CHANGED
@@ -320,12 +320,12 @@ def compute_positionwise_statistics(
 
     # samples / refs
     sseries = adata.obs[sample_col]
-    if not pd.
+    if not isinstance(sseries.dtype, pd.CategoricalDtype):
         sseries = sseries.astype("category")
     samples = list(sseries.cat.categories)
 
     rseries = adata.obs[ref_col]
-    if not pd.
+    if not isinstance(rseries.dtype, pd.CategoricalDtype):
         rseries = rseries.astype("category")
     references = list(rseries.cat.categories)
 
@@ -509,12 +509,12 @@ def plot_positionwise_matrices(
 
     # canonicalize sample/ref order
     sseries = adata.obs[sample_col]
-    if not pd.
+    if not isinstance(sseries.dtype, pd.CategoricalDtype):
        sseries = sseries.astype("category")
     samples = list(sseries.cat.categories)
 
     rseries = adata.obs[ref_col]
-    if not pd.
+    if not isinstance(rseries.dtype, pd.CategoricalDtype):
         rseries = rseries.astype("category")
     references = list(rseries.cat.categories)
 
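The position_stats change swaps a dtype check for an isinstance test against pd.CategoricalDtype. A standalone sketch of the idiom on a toy Series; the Series itself is illustrative, not package data.

    import pandas as pd

    s = pd.Series(["sampleA", "sampleB", "sampleA"])

    # Coerce to categorical only when the dtype is not already categorical,
    # mirroring the checks in compute_positionwise_statistics / plot_positionwise_matrices.
    if not isinstance(s.dtype, pd.CategoricalDtype):
        s = s.astype("category")

    print(list(s.cat.categories))  # ['sampleA', 'sampleB']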