smftools 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +18 -2
  4. smftools/cli/hmm_adata.py +18 -1
  5. smftools/cli/latent_adata.py +522 -67
  6. smftools/cli/load_adata.py +2 -2
  7. smftools/cli/preprocess_adata.py +32 -93
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +23 -109
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +41 -5
  12. smftools/config/conversion.yaml +0 -10
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +49 -13
  15. smftools/config/experiment_config.py +96 -3
  16. smftools/constants.py +4 -0
  17. smftools/hmm/call_hmm_peaks.py +1 -1
  18. smftools/informatics/binarize_converted_base_identities.py +2 -89
  19. smftools/informatics/converted_BAM_to_adata.py +53 -13
  20. smftools/informatics/h5ad_functions.py +83 -0
  21. smftools/informatics/modkit_extract_to_adata.py +4 -0
  22. smftools/plotting/__init__.py +26 -12
  23. smftools/plotting/autocorrelation_plotting.py +22 -4
  24. smftools/plotting/chimeric_plotting.py +1893 -0
  25. smftools/plotting/classifiers.py +28 -14
  26. smftools/plotting/general_plotting.py +58 -3362
  27. smftools/plotting/hmm_plotting.py +1586 -2
  28. smftools/plotting/latent_plotting.py +804 -0
  29. smftools/plotting/plotting_utils.py +243 -0
  30. smftools/plotting/position_stats.py +16 -8
  31. smftools/plotting/preprocess_plotting.py +281 -0
  32. smftools/plotting/qc_plotting.py +8 -3
  33. smftools/plotting/spatial_plotting.py +1134 -0
  34. smftools/plotting/variant_plotting.py +1231 -0
  35. smftools/preprocessing/__init__.py +3 -0
  36. smftools/preprocessing/append_base_context.py +1 -1
  37. smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
  38. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  39. smftools/preprocessing/append_variant_call_layer.py +480 -0
  40. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  41. smftools/preprocessing/invert_adata.py +1 -0
  42. smftools/readwrite.py +109 -85
  43. smftools/tools/__init__.py +6 -0
  44. smftools/tools/calculate_knn.py +121 -0
  45. smftools/tools/calculate_nmf.py +18 -7
  46. smftools/tools/calculate_pca.py +180 -0
  47. smftools/tools/calculate_umap.py +70 -154
  48. smftools/tools/position_stats.py +4 -4
  49. smftools/tools/rolling_nn_distance.py +640 -3
  50. smftools/tools/sequence_alignment.py +140 -0
  51. smftools/tools/tensor_factorization.py +52 -4
  52. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
  53. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
  54. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  55. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  56. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/tools/calculate_knn.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def calculate_knn(
+    adata: "ad.AnnData",
+    obsm: str = "X_pca",
+    knn_neighbors: int = 100,
+    overwrite: bool = True,
+    threads: int = 8,
+    random_state: int | None = 0,
+    symmetrize: bool = True,
+) -> "ad.AnnData":
+    """Compute a KNN distance graph on an embedding in `adata.obsm[obsm]`.
+
+    Stores:
+      - adata.obsp[f"knn_distances_{obsm}"] : CSR sparse matrix of distances
+      - adata.uns[f"knn_distances_{obsm}"]["params"] : metadata
+
+    Args:
+        adata: AnnData object to update.
+        obsm: Key in `adata.obsm` to use as the embedding.
+        knn_neighbors: Target number of neighbors (will be clipped to n_obs-1).
+        overwrite: If False and graph exists, do nothing.
+        threads: Parallel jobs for pynndescent.
+        random_state: Seed for pynndescent.
+        symmetrize: If True, make distance graph symmetric via min(A, A.T).
+
+    Returns:
+        Updated AnnData.
+    """
+    import numpy as np
+    import scipy.sparse as sp
+
+    if obsm not in adata.obsm:
+        raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
+
+    out_key = f"knn_distances_{obsm}"
+    if not overwrite and out_key in adata.obsp:
+        logger.info("KNN graph %r already exists and overwrite=False; skipping.", out_key)
+        return adata
+
+    data = adata.obsm[obsm]
+
+    if sp.issparse(data):
+        # Convert to float32 for pynndescent/numba friendliness if needed
+        data = data.astype(np.float32)
+        logger.info(
+            "Sparse embedding detected (%s). Proceeding without NaN check.", type(data).__name__
+        )
+    else:
+        data = np.asarray(data)
+        if np.isnan(data).any():
+            logger.warning("NaNs detected in %s; filling NaNs with 0.5 before KNN.", obsm)
+            data = np.nan_to_num(data, nan=0.5)
+        data = data.astype(np.float32, copy=False)
+
+    pynndescent = require("pynndescent", extra="umap", purpose="KNN graph computation")
+
+    n_obs = data.shape[0]
+    if n_obs < 2:
+        raise ValueError(f"Need at least 2 observations for KNN; got n_obs={n_obs}")
+
+    n_neighbors = min(int(knn_neighbors), n_obs - 1)
+    if n_neighbors < 1:
+        raise ValueError(f"Computed n_neighbors={n_neighbors}; check knn_neighbors and n_obs.")
+
+    logger.info(
+        "Running pynndescent KNN (obsm=%s, n_neighbors=%d, metric=euclidean, n_jobs=%d)",
+        obsm,
+        n_neighbors,
+        threads,
+    )
+
+    nn_index = pynndescent.NNDescent(
+        data,
+        n_neighbors=n_neighbors,
+        metric="euclidean",
+        random_state=random_state,
+        n_jobs=threads,
+    )
+    knn_indices, knn_dists = nn_index.neighbor_graph  # shapes: (n_obs, n_neighbors)
+
+    rows = np.repeat(np.arange(n_obs, dtype=np.int64), n_neighbors)
+    cols = knn_indices.reshape(-1).astype(np.int64, copy=False)
+    vals = knn_dists.reshape(-1).astype(np.float32, copy=False)
+
+    distances = sp.coo_matrix((vals, (rows, cols)), shape=(n_obs, n_obs)).tocsr()
+
+    # Optional: ensure diagonal is 0 and (optionally) symmetrize
+    distances.setdiag(0.0)
+    distances.eliminate_zeros()
+
+    if symmetrize:
+        # Keep the smaller directed distance for each undirected edge
+        distances = distances.minimum(distances.T)
+
+    adata.obsp[out_key] = distances
+    adata.uns[out_key] = {
+        "params": {
+            "obsm": obsm,
+            "n_neighbors_requested": int(knn_neighbors),
+            "n_neighbors_used": int(n_neighbors),
+            "method": "pynndescent",
+            "metric": "euclidean",
+            "random_state": random_state,
+            "n_jobs": int(threads),
+            "symmetrize": bool(symmetrize),
+        }
+    }
+
+    return adata
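
Because the output keys embed the embedding name, graphs built from different embeddings can coexist on one object. A minimal usage sketch against the signature above; the toy data is illustrative, and the import path assumes the module is importable as `smftools.tools.calculate_knn` (the updated `tools/__init__.py` in this release suggests a re-export, but that hunk is not shown):

import anndata as ad
import numpy as np

from smftools.tools.calculate_knn import calculate_knn

# Toy AnnData with a precomputed embedding; shapes and values are illustrative.
rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((200, 50), dtype=np.float32))
adata.obsm["X_pca"] = rng.random((200, 15), dtype=np.float32)

# Build the KNN distance graph on the PCA embedding.
adata = calculate_knn(adata, obsm="X_pca", knn_neighbors=30, threads=4)

# Outputs land under keys derived from the embedding name.
dist = adata.obsp["knn_distances_X_pca"]             # CSR, shape (200, 200)
params = adata.uns["knn_distances_X_pca"]["params"]
print(dist.nnz, params["n_neighbors_used"])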
smftools/tools/calculate_nmf.py
@@ -9,6 +9,7 @@ from smftools.optional_imports import require
 
 if TYPE_CHECKING:
     import anndata as ad
+    import numpy as np
 
 logger = get_logger(__name__)
 
@@ -16,7 +17,7 @@ logger = get_logger(__name__)
 def calculate_nmf(
     adata: "ad.AnnData",
     layer: str | None = "nan_half",
-    var_filters: Sequence[str] | None = None,
+    var_mask: "np.ndarray | Sequence[bool] | None" = None,
     n_components: int = 2,
     max_iter: int = 200,
     random_state: int = 0,
@@ -24,13 +25,14 @@ def calculate_nmf(
     embedding_key: str = "X_nmf",
     components_key: str = "H_nmf",
     uns_key: str = "nmf",
+    suffix: str | None = None,
 ) -> "ad.AnnData":
     """Compute a low-dimensional NMF embedding.
 
     Args:
         adata: AnnData object to update.
         layer: Layer name to use for NMF (``None`` uses ``adata.X``).
-        var_filters: Optional list of var masks to subset features.
+        var_mask: Optional boolean mask to subset features.
         n_components: Number of NMF components to compute.
         max_iter: Maximum number of NMF iterations.
         random_state: Random seed for the NMF initializer.
@@ -47,6 +49,11 @@ def calculate_nmf(
     require("sklearn", extra="ml-base", purpose="NMF calculation")
     from sklearn.decomposition import NMF
 
+    if suffix:
+        embedding_key = f"{embedding_key}_{suffix}"
+        components_key = f"{components_key}_{suffix}"
+        uns_key = f"{uns_key}_{suffix}"
+
     has_embedding = embedding_key in adata.obsm
     has_components = components_key in adata.varm
     if has_embedding and has_components and not overwrite:
@@ -56,17 +63,21 @@ def calculate_nmf(
         logger.info("NMF embedding present without components; recomputing to store components.")
 
     subset_mask = None
-    if var_filters:
-        subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
+    if var_mask is not None:
+        subset_mask = np.asarray(var_mask, dtype=bool)
+        if subset_mask.ndim != 1 or subset_mask.shape[0] != adata.n_vars:
+            raise ValueError(
+                "var_mask must be a 1D boolean array with length matching adata.n_vars."
+            )
         adata_subset = adata[:, subset_mask].copy()
         logger.info(
             "Subsetting adata: retained %s features based on filters %s",
             adata_subset.shape[1],
-            var_filters,
+            "var_mask",
         )
     else:
         adata_subset = adata.copy()
-        logger.info("No var filters provided. Using all features.")
+        logger.info("No var_mask provided. Using all features.")
 
     data = adata_subset.layers[layer] if layer else adata_subset.X
     if issparse(data):
@@ -107,7 +118,7 @@ def calculate_nmf(
             "max_iter": max_iter,
             "random_state": random_state,
             "layer": layer,
-            "var_filters": list(var_filters) if var_filters else None,
+            "var_mask_provided": var_mask is not None,
             "components_key": components_key,
         }
 
smftools/tools/calculate_pca.py
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Sequence
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+    import numpy as np
+
+logger = get_logger(__name__)
+
+
+def calculate_pca(
+    adata: "ad.AnnData",
+    layer: str | None = "nan_half",
+    var_mask: "np.ndarray | Sequence[bool] | None" = None,
+    n_pcs: int = 15,
+    overwrite: bool = True,
+    output_suffix: str | None = None,
+    fill_nan: float | None = 0.5,
+) -> "ad.AnnData":
+    """Compute PCA and store scores in `.obsm` and loadings in `.varm`."""
+
+    import numpy as np
+    import scipy.sparse as sp
+
+    obsm_output = f"X_pca_{output_suffix}" if output_suffix else "X_pca"
+    varm_output = f"PCs_{output_suffix}" if output_suffix else "PCs"
+
+    if not overwrite and obsm_output in adata.obsm and varm_output in adata.varm:
+        logger.info(
+            "PCA outputs already exist and overwrite=False; skipping (%s, %s).",
+            obsm_output,
+            varm_output,
+        )
+        return adata
+
+    # --- Build feature subset mask (over vars) ---
+    if var_mask is not None:
+        subset_mask = np.asarray(var_mask, dtype=bool)
+        if subset_mask.ndim != 1 or subset_mask.shape[0] != adata.n_vars:
+            raise ValueError(
+                "var_mask must be a 1D boolean array with length matching adata.n_vars."
+            )
+        n_vars_used = int(subset_mask.sum())
+        if n_vars_used == 0:
+            raise ValueError("var_mask retained 0 features.")
+        logger.info(
+            "Subsetting vars: retained %d / %d features from var_mask",
+            n_vars_used,
+            adata.n_vars,
+        )
+    else:
+        subset_mask = slice(None)
+        n_vars_used = adata.n_vars
+        logger.info("No var_mask provided; using all %d features.", adata.n_vars)
+
+    # --- Pull matrix view ---
+    if layer is None:
+        matrix = adata.X
+        layer_used = None
+    else:
+        if layer not in adata.layers:
+            raise KeyError(
+                f"Layer {layer!r} not found in adata.layers. Available: {list(adata.layers.keys())}"
+            )
+        matrix = adata.layers[layer]
+        layer_used = layer
+
+    matrix = matrix[:, subset_mask]  # slice view (sparse OK)
+
+    n_obs = matrix.shape[0]
+    if n_obs < 2:
+        raise ValueError(f"PCA requires at least 2 observations; got n_obs={n_obs}")
+    if n_vars_used < 1:
+        raise ValueError("PCA requires at least 1 feature.")
+
+    n_pcs_requested = int(n_pcs)
+    n_pcs_used = min(n_pcs_requested, n_obs, n_vars_used)
+    if n_pcs_used < 1:
+        raise ValueError(f"n_pcs_used became {n_pcs_used}; check inputs.")
+
+    # --- NaN handling (dense only; sparse usually won't store NaNs) ---
+    if not sp.issparse(matrix):
+        X = np.asarray(matrix, dtype=np.float32)
+        if fill_nan is not None and np.isnan(X).any():
+            logger.warning("NaNs detected; filling NaNs with %s before PCA.", fill_nan)
+            X = np.nan_to_num(X, nan=float(fill_nan))
+    else:
+        X = matrix  # keep sparse
+
+    # --- PCA ---
+    # Prefer sklearn's randomized PCA for speed on big matrices.
+    used_sklearn = False
+    try:
+        sklearn = require("sklearn", extra="ml", purpose="PCA computation")
+        from sklearn.decomposition import PCA, TruncatedSVD
+
+        if sp.issparse(X):
+            # TruncatedSVD works on sparse without centering; good approximation.
+            # If you *need* centered PCA on sparse, you'd need different machinery.
+            logger.info("Running TruncatedSVD (sparse) with n_components=%d", n_pcs_used)
+            model = TruncatedSVD(n_components=n_pcs_used, random_state=0)
+            scores = model.fit_transform(X)  # (n_obs, n_pcs)
+            loadings = model.components_.T  # (n_vars_used, n_pcs)
+            mean = None
+            explained_variance_ratio = getattr(model, "explained_variance_ratio_", None)
+        else:
+            logger.info(
+                "Running sklearn PCA with n_components=%d (svd_solver=randomized)", n_pcs_used
+            )
+            model = PCA(n_components=n_pcs_used, svd_solver="randomized", random_state=0)
+            scores = model.fit_transform(X)  # (n_obs, n_pcs)
+            loadings = model.components_.T  # (n_vars_used, n_pcs)
+            mean = model.mean_
+            explained_variance_ratio = model.explained_variance_ratio_
+
+        used_sklearn = True
+
+    except Exception as e:
+        # Fallback to your manual SVD (dense only)
+        if sp.issparse(X):
+            raise RuntimeError(
+                "Sparse input PCA fallback is not implemented without sklearn. "
+                "Install scikit-learn (extra 'ml') or densify upstream."
+            ) from e
+
+        import scipy.linalg as spla
+
+        logger.warning(
+            "sklearn PCA unavailable; falling back to full SVD (can be slow). Reason: %s", e
+        )
+        Xd = np.asarray(X, dtype=np.float64)
+        mean = Xd.mean(axis=0)
+        centered = Xd - mean
+        u, s, vt = spla.svd(centered, full_matrices=False)
+        u = u[:, :n_pcs_used]
+        s = s[:n_pcs_used]
+        vt = vt[:n_pcs_used]
+        scores = u * s
+        loadings = vt.T
+        explained_variance_ratio = None
+
+    # --- Store scores (obsm) ---
+    adata.obsm[obsm_output] = scores
+
+    # --- Store loadings (varm) with original var dimension ---
+    pc_matrix = np.zeros((adata.n_vars, n_pcs_used), dtype=np.float32)
+    if isinstance(subset_mask, slice):
+        pc_matrix[:, :] = loadings
+    else:
+        pc_matrix[subset_mask, :] = loadings.astype(np.float32, copy=False)
+
+    adata.varm[varm_output] = pc_matrix
+
+    # --- Metadata ---
+    adata.uns[obsm_output] = {
+        "params": {
+            "layer": layer_used,
+            "var_mask_provided": var_mask is not None,
+            "n_pcs_requested": n_pcs_requested,
+            "n_pcs_used": int(n_pcs_used),
+            "used_sklearn": used_sklearn,
+            "fill_nan": fill_nan,
+            "note_sparse": bool(sp.issparse(matrix)),
+        },
+        "explained_variance_ratio": explained_variance_ratio,
+        "mean": mean.tolist() if (mean is not None and isinstance(mean, np.ndarray)) else None,
+    }
+
+    logger.info(
+        "Stored PCA: adata.obsm[%s] (%s) and adata.varm[%s] (%s)",
+        obsm_output,
+        scores.shape,
+        varm_output,
+        pc_matrix.shape,
+    )
+    return adata
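
The function picks its backend from the input type: dense matrices go through scikit-learn's randomized PCA (with a plain-SVD fallback when scikit-learn is missing), while sparse matrices use uncentered TruncatedSVD. A quick sketch of the dense path with illustrative toy data, under the same import-path assumption as above:

import anndata as ad
import numpy as np

from smftools.tools.calculate_pca import calculate_pca

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((300, 80), dtype=np.float32))
adata.layers["nan_half"] = adata.X.copy()

# Dense input: sklearn randomized PCA (or full-SVD fallback).
adata = calculate_pca(adata, layer="nan_half", n_pcs=10, output_suffix="demo")

print(adata.obsm["X_pca_demo"].shape)   # (300, 10)
print(adata.varm["PCs_demo"].shape)     # (80, 10); zero-filled outside any var_mask
print(adata.uns["X_pca_demo"]["params"]["n_pcs_used"])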
smftools/tools/calculate_umap.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING
 
 from smftools.logging_utils import get_logger
 from smftools.optional_imports import require
@@ -13,174 +13,90 @@ logger = get_logger(__name__)
 
 def calculate_umap(
     adata: "ad.AnnData",
-    layer: str | None = "nan_half",
-    var_filters: Sequence[str] | None = None,
-    n_pcs: int = 15,
-    knn_neighbors: int = 100,
+    obsm: str | None = "X_pca",
     overwrite: bool = True,
     threads: int = 8,
     random_state: int | None = 0,
+    output_suffix: str | None = None,
 ) -> "ad.AnnData":
-    """Compute PCA, neighbors, and UMAP embeddings.
-
-    Args:
-        adata: AnnData object to update.
-        layer: Layer name to use for PCA/UMAP (``None`` uses ``adata.X``).
-        var_filters: Optional list of var masks to subset features.
-        n_pcs: Number of principal components.
-        knn_neighbors: Number of neighbors for the graph.
-        overwrite: Whether to recompute embeddings if they exist.
-        threads: Number of OMP threads for computation.
-
-    Returns:
-        anndata.AnnData: Updated AnnData object.
-    """
-    import os
+    """Compute UMAP embedding from an `.obsm` embedding, and store connectivities."""
 
     import numpy as np
-    import scipy.linalg as spla
     import scipy.sparse as sp
 
+    if obsm is None:
+        raise ValueError("obsm must be a key in adata.obsm (e.g., 'X_pca').")
+
+    if obsm not in adata.obsm:
+        raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
+
     umap = require("umap", extra="umap", purpose="UMAP calculation")
-    pynndescent = require("pynndescent", extra="umap", purpose="KNN graph computation")
-
-    os.environ["OMP_NUM_THREADS"] = str(threads)
-
-    # Step 1: Apply var filter
-    if var_filters:
-        subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
-        adata_subset = adata[:, subset_mask].copy()
-        logger.info(
-            "Subsetting adata: retained %s features based on filters %s",
-            adata_subset.shape[1],
-            var_filters,
+
+    output_obsm = f"X_umap_{output_suffix}" if output_suffix else "X_umap"
+    conn_key = f"connectivities_{obsm}"
+
+    # Decide n_neighbors: prefer stored KNN params, else UMAP default-ish
+    n_neighbors = None
+    knn_uns_key = f"knn_distances_{obsm}"
+    if knn_uns_key in adata.uns:
+        params = adata.uns[knn_uns_key].get("params", {})
+        n_neighbors = params.get("n_neighbors_used", params.get("n_neighbors", None))
+    if n_neighbors is None:
+        n_neighbors = 15  # reasonable default if KNN wasn't precomputed
+        logger.warning(
+            "No %r found in adata.uns; defaulting n_neighbors=%d for UMAP.",
+            knn_uns_key,
+            n_neighbors,
         )
+
+    # Build input matrix X and handle NaNs locally
+    X = adata.obsm[obsm]
+    if sp.issparse(X):
+        # UMAP can accept sparse CSR; keep it sparse
+        pass
     else:
-        adata_subset = adata.copy()
-        logger.info("No var filters provided. Using all features.")
-
-    # Step 2: NaN handling inside layer
-    if layer:
-        data = adata_subset.layers[layer]
-        if not sp.issparse(data):
-            if np.isnan(data).any():
-                logger.warning("NaNs detected, filling with 0.5 before PCA + neighbors.")
-                data = np.nan_to_num(data, nan=0.5)
-                adata_subset.layers[layer] = data
-            else:
-                logger.info("No NaNs detected.")
-        else:
-            logger.info(
-                "Sparse matrix detected; skipping NaN check (sparse formats typically do not store NaNs)."
-            )
-
-    # Step 3: PCA + neighbors + UMAP on subset
-    if "X_umap" not in adata_subset.obsm or overwrite:
-        n_pcs = min(adata_subset.shape[1], n_pcs)
-        logger.info("Running PCA with n_pcs=%s", n_pcs)
-
-        if layer:
-            matrix = adata_subset.layers[layer]
-        else:
-            matrix = adata_subset.X
-
-        if sp.issparse(matrix):
-            logger.warning("Converting sparse matrix to dense for PCA.")
-            matrix = matrix.toarray()
-
-        matrix = np.asarray(matrix, dtype=float)
-        mean = matrix.mean(axis=0)
-        centered = matrix - mean
-
-        if centered.shape[0] == 0 or centered.shape[1] == 0:
-            raise ValueError("PCA requires a non-empty matrix.")
-
-        if n_pcs <= 0:
-            raise ValueError("n_pcs must be positive.")
-
-        if centered.shape[1] <= n_pcs:
-            n_pcs = centered.shape[1]
-
-        if centered.shape[0] < n_pcs:
-            n_pcs = centered.shape[0]
-
-        u, s, vt = spla.svd(centered, full_matrices=False)
-
-        u = u[:, :n_pcs]
-        s = s[:n_pcs]
-        vt = vt[:n_pcs]
-
-        adata_subset.obsm["X_pca"] = u * s
-        adata_subset.varm["PCs"] = vt.T
-
-        logger.info("Running neighborhood graph with pynndescent (n_neighbors=%s)", knn_neighbors)
-        n_neighbors = min(knn_neighbors, max(1, adata_subset.n_obs - 1))
-        nn_index = pynndescent.NNDescent(
-            adata_subset.obsm["X_pca"],
-            n_neighbors=n_neighbors,
-            metric="euclidean",
-            random_state=random_state,
-            n_jobs=threads,
-        )
-        knn_indices, knn_dists = nn_index.neighbor_graph
-
-        rows = np.repeat(np.arange(adata_subset.n_obs), n_neighbors)
-        cols = knn_indices.reshape(-1)
-        distances = sp.coo_matrix(
-            (knn_dists.reshape(-1), (rows, cols)),
-            shape=(adata_subset.n_obs, adata_subset.n_obs),
-        ).tocsr()
-        adata_subset.obsp["distances"] = distances
-
-        logger.info("Running UMAP")
-        umap_model = umap.UMAP(
-            n_neighbors=n_neighbors,
-            n_components=2,
-            metric="euclidean",
-            random_state=random_state,
+        X = np.asarray(X)
+        if np.isnan(X).any():
+            logger.warning("NaNs detected in %s; filling NaNs with 0.5 for UMAP.", obsm)
+            X = np.nan_to_num(X, nan=0.5)
+
+    if (not overwrite) and (output_obsm in adata.obsm) and (conn_key in adata.obsp):
+        logger.info("UMAP + connectivities already exist and overwrite=False; skipping.")
+        return adata
+
+    logger.info("Running UMAP (obsm=%s, n_neighbors=%d, metric=euclidean)", obsm, n_neighbors)
+
+    # Note: umap-learn uses numba threading; n_jobs controls parallelism in UMAP
+    # and is ignored when random_state is set (umap-learn behavior).
+    umap_model = umap.UMAP(
+        n_neighbors=int(n_neighbors),
+        n_components=2,
+        metric="euclidean",
+        random_state=random_state,
+        n_jobs=int(threads),
+    )
+
+    embedding = umap_model.fit_transform(X)
+    adata.obsm[output_obsm] = embedding
+
+    # UMAP's computed fuzzy graph
+    connectivities = getattr(umap_model, "graph_", None)
+    if connectivities is not None:
+        adata.obsp[conn_key] = (
+            connectivities.tocsr() if sp.issparse(connectivities) else connectivities
        )
-        adata_subset.obsm["X_umap"] = umap_model.fit_transform(adata_subset.obsm["X_pca"])
-
-        try:
-            from umap.umap_ import fuzzy_simplicial_set
-
-            fuzzy_result = fuzzy_simplicial_set(
-                adata_subset.obsm["X_pca"],
-                n_neighbors=n_neighbors,
-                random_state=random_state,
-                metric="euclidean",
-                knn_indices=knn_indices,
-                knn_dists=knn_dists,
-            )
-            connectivities = fuzzy_result[0] if isinstance(fuzzy_result, tuple) else fuzzy_result
-        except TypeError:
-            connectivities = umap_model.graph_
-
-        adata_subset.obsp["connectivities"] = connectivities
-
-    # Step 4: Store results in original adata
-    adata.obsm["X_pca"] = adata_subset.obsm["X_pca"]
-    adata.obsm["X_umap"] = adata_subset.obsm["X_umap"]
-    adata.obsp["distances"] = adata_subset.obsp["distances"]
-    adata.obsp["connectivities"] = adata_subset.obsp["connectivities"]
-    adata.uns["neighbors"] = {
+    else:
+        logger.warning("UMAP model did not expose graph_; connectivities not stored.")
+
+    adata.uns[output_obsm] = {
         "params": {
-            "n_neighbors": knn_neighbors,
-            "method": "pynndescent",
+            "obsm": obsm,
+            "n_neighbors": int(n_neighbors),
             "metric": "euclidean",
+            "random_state": random_state,
+            "n_jobs": int(threads),
        }
    }
 
-    # Fix varm["PCs"] shape mismatch
-    pc_matrix = np.zeros((adata.shape[1], adata_subset.varm["PCs"].shape[1]))
-    if var_filters:
-        subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
-        pc_matrix[subset_mask, :] = adata_subset.varm["PCs"]
-    else:
-        pc_matrix = adata_subset.varm["PCs"]  # No subsetting case
-
-    adata.varm["PCs"] = pc_matrix
-
-    logger.info("Stored: adata.obsm['X_pca'] and adata.obsm['X_umap']")
-
+    logger.info("Stored: adata.obsm[%s]=%s", output_obsm, embedding.shape)
     return adata
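
The net effect of this rewrite is that `calculate_umap` no longer performs PCA or neighbor search itself; the three new modules compose explicitly, with `calculate_umap` reusing the neighbor count recorded by `calculate_knn`. A sketch of the intended composition, continuing from the `adata` built in the PCA sketch above (import paths assumed as before):

from smftools.tools.calculate_knn import calculate_knn
from smftools.tools.calculate_pca import calculate_pca
from smftools.tools.calculate_umap import calculate_umap

# 1) PCA writes adata.obsm["X_pca"] (suffix omitted so the default key,
#    which calculate_umap expects by default, is used).
adata = calculate_pca(adata, layer="nan_half", n_pcs=15)

# 2) KNN records n_neighbors_used in adata.uns["knn_distances_X_pca"]["params"].
adata = calculate_knn(adata, obsm="X_pca", knn_neighbors=100)

# 3) calculate_umap picks that value up; without a prior calculate_knn it
#    warns and falls back to n_neighbors=15.
adata = calculate_umap(adata, obsm="X_pca", random_state=0)

print(adata.obsm["X_umap"].shape)                # (n_obs, 2)
print(adata.obsp["connectivities_X_pca"].shape)  # fuzzy graph from umap-learn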
smftools/tools/position_stats.py
@@ -320,12 +320,12 @@ def compute_positionwise_statistics(
 
     # samples / refs
     sseries = adata.obs[sample_col]
-    if not pd.api.types.is_categorical_dtype(sseries):
+    if not isinstance(sseries.dtype, pd.CategoricalDtype):
         sseries = sseries.astype("category")
     samples = list(sseries.cat.categories)
 
     rseries = adata.obs[ref_col]
-    if not pd.api.types.is_categorical_dtype(rseries):
+    if not isinstance(rseries.dtype, pd.CategoricalDtype):
         rseries = rseries.astype("category")
     references = list(rseries.cat.categories)
 
@@ -509,12 +509,12 @@ def plot_positionwise_matrices(
 
     # canonicalize sample/ref order
     sseries = adata.obs[sample_col]
-    if not pd.api.types.is_categorical_dtype(sseries):
+    if not isinstance(sseries.dtype, pd.CategoricalDtype):
         sseries = sseries.astype("category")
     samples = list(sseries.cat.categories)
 
     rseries = adata.obs[ref_col]
-    if not pd.api.types.is_categorical_dtype(rseries):
+    if not isinstance(rseries.dtype, pd.CategoricalDtype):
         rseries = rseries.astype("category")
     references = list(rseries.cat.categories)
 
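
These hunks swap `pd.api.types.is_categorical_dtype` for an `isinstance` check because that helper is deprecated since pandas 2.2; `isinstance(series.dtype, pd.CategoricalDtype)` is pandas' documented replacement. A standalone illustration of the pattern:

import pandas as pd

s = pd.Series(["a", "b", "a"])

# Deprecated (warns on pandas >= 2.2):
#   pd.api.types.is_categorical_dtype(s)
# Replacement used throughout this diff:
if not isinstance(s.dtype, pd.CategoricalDtype):
    s = s.astype("category")

print(list(s.cat.categories))  # ['a', 'b']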