smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/tools/calculate_umap.py
@@ -1,6 +1,6 @@
  from __future__ import annotations
 
- from typing import TYPE_CHECKING, Sequence
+ from typing import TYPE_CHECKING
 
  from smftools.logging_utils import get_logger
  from smftools.optional_imports import require
@@ -13,91 +13,90 @@ logger = get_logger(__name__)
 
  def calculate_umap(
      adata: "ad.AnnData",
-     layer: str | None = "nan_half",
-     var_filters: Sequence[str] | None = None,
-     n_pcs: int = 15,
-     knn_neighbors: int = 100,
+     obsm: str | None = "X_pca",
      overwrite: bool = True,
      threads: int = 8,
+     random_state: int | None = 0,
+     output_suffix: str | None = None,
  ) -> "ad.AnnData":
-     """Compute PCA, neighbors, and UMAP embeddings.
- 
-     Args:
-         adata: AnnData object to update.
-         layer: Layer name to use for PCA/UMAP (``None`` uses ``adata.X``).
-         var_filters: Optional list of var masks to subset features.
-         n_pcs: Number of principal components.
-         knn_neighbors: Number of neighbors for the graph.
-         overwrite: Whether to recompute embeddings if they exist.
-         threads: Number of OMP threads for computation.
- 
-     Returns:
-         anndata.AnnData: Updated AnnData object.
-     """
-     import os
+     """Compute UMAP embedding from an `.obsm` embedding, and store connectivities."""
 
      import numpy as np
- 
-     sc = require("scanpy", extra="scanpy", purpose="UMAP calculation")
-     from scipy.sparse import issparse
- 
-     os.environ["OMP_NUM_THREADS"] = str(threads)
- 
-     # Step 1: Apply var filter
-     if var_filters:
-         subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
-         adata_subset = adata[:, subset_mask].copy()
-         logger.info(
-             "Subsetting adata: retained %s features based on filters %s",
-             adata_subset.shape[1],
-             var_filters,
+     import scipy.sparse as sp
+ 
+     if obsm is None:
+         raise ValueError("obsm must be a key in adata.obsm (e.g., 'X_pca').")
+ 
+     if obsm not in adata.obsm:
+         raise KeyError(f"`{obsm}` not found in adata.obsm. Available: {list(adata.obsm.keys())}")
+ 
+     umap = require("umap", extra="umap", purpose="UMAP calculation")
+ 
+     output_obsm = f"X_umap_{output_suffix}" if output_suffix else "X_umap"
+     conn_key = f"connectivities_{obsm}"
+ 
+     # Decide n_neighbors: prefer stored KNN params, else UMAP default-ish
+     n_neighbors = None
+     knn_uns_key = f"knn_distances_{obsm}"
+     if knn_uns_key in adata.uns:
+         params = adata.uns[knn_uns_key].get("params", {})
+         n_neighbors = params.get("n_neighbors_used", params.get("n_neighbors", None))
+     if n_neighbors is None:
+         n_neighbors = 15 # reasonable default if KNN wasn't precomputed
+         logger.warning(
+             "No %r found in adata.uns; defaulting n_neighbors=%d for UMAP.",
+             knn_uns_key,
+             n_neighbors,
          )
+ 
+     # Build input matrix X and handle NaNs locally
+     X = adata.obsm[obsm]
+     if sp.issparse(X):
+         # UMAP can accept sparse CSR; keep it sparse
+         pass
      else:
-         adata_subset = adata.copy()
-         logger.info("No var filters provided. Using all features.")
- 
-     # Step 2: NaN handling inside layer
-     if layer:
-         data = adata_subset.layers[layer]
-         if not issparse(data):
-             if np.isnan(data).any():
-                 logger.warning("NaNs detected, filling with 0.5 before PCA + neighbors.")
-                 data = np.nan_to_num(data, nan=0.5)
-                 adata_subset.layers[layer] = data
-             else:
-                 logger.info("No NaNs detected.")
-         else:
-             logger.info(
-                 "Sparse matrix detected; skipping NaN check (sparse formats typically do not store NaNs)."
-             )
- 
-     # Step 3: PCA + neighbors + UMAP on subset
-     if "X_umap" not in adata_subset.obsm or overwrite:
-         n_pcs = min(adata_subset.shape[1], n_pcs)
-         logger.info("Running PCA with n_pcs=%s", n_pcs)
-         sc.pp.pca(adata_subset, layer=layer)
-         logger.info("Running neighborhood graph")
-         sc.pp.neighbors(adata_subset, use_rep="X_pca", n_pcs=n_pcs, n_neighbors=knn_neighbors)
-         logger.info("Running UMAP")
-         sc.tl.umap(adata_subset)
- 
-     # Step 4: Store results in original adata
-     adata.obsm["X_pca"] = adata_subset.obsm["X_pca"]
-     adata.obsm["X_umap"] = adata_subset.obsm["X_umap"]
-     adata.obsp["distances"] = adata_subset.obsp["distances"]
-     adata.obsp["connectivities"] = adata_subset.obsp["connectivities"]
-     adata.uns["neighbors"] = adata_subset.uns["neighbors"]
- 
-     # Fix varm["PCs"] shape mismatch
-     pc_matrix = np.zeros((adata.shape[1], adata_subset.varm["PCs"].shape[1]))
-     if var_filters:
-         subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
-         pc_matrix[subset_mask, :] = adata_subset.varm["PCs"]
+         X = np.asarray(X)
+         if np.isnan(X).any():
+             logger.warning("NaNs detected in %s; filling NaNs with 0.5 for UMAP.", obsm)
+             X = np.nan_to_num(X, nan=0.5)
+ 
+     if (not overwrite) and (output_obsm in adata.obsm) and (conn_key in adata.obsp):
+         logger.info("UMAP + connectivities already exist and overwrite=False; skipping.")
+         return adata
+ 
+     logger.info("Running UMAP (obsm=%s, n_neighbors=%d, metric=euclidean)", obsm, n_neighbors)
+ 
+     # Note: umap-learn uses numba threading; n_jobs controls parallelism in UMAP
+     # and is ignored when random_state is set (umap-learn behavior).
+     umap_model = umap.UMAP(
+         n_neighbors=int(n_neighbors),
+         n_components=2,
+         metric="euclidean",
+         random_state=random_state,
+         n_jobs=int(threads),
+     )
+ 
+     embedding = umap_model.fit_transform(X)
+     adata.obsm[output_obsm] = embedding
+ 
+     # UMAP's computed fuzzy graph
+     connectivities = getattr(umap_model, "graph_", None)
+     if connectivities is not None:
+         adata.obsp[conn_key] = (
+             connectivities.tocsr() if sp.issparse(connectivities) else connectivities
+         )
      else:
-         pc_matrix = adata_subset.varm["PCs"] # No subsetting case
- 
-     adata.varm["PCs"] = pc_matrix
- 
-     logger.info("Stored: adata.obsm['X_pca'] and adata.obsm['X_umap']")
- 
+         logger.warning("UMAP model did not expose graph_; connectivities not stored.")
+ 
+     adata.uns[output_obsm] = {
+         "params": {
+             "obsm": obsm,
+             "n_neighbors": int(n_neighbors),
+             "metric": "euclidean",
+             "random_state": random_state,
+             "n_jobs": int(threads),
+         }
+     }
+ 
+     logger.info("Stored: adata.obsm[%s]=%s", output_obsm, embedding.shape)
      return adata
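
For orientation, a minimal usage sketch of the new `calculate_umap` signature shown above. This is not taken from the package docs: it assumes `umap-learn` and `scanpy` are installed, that the function is importable from the `smftools.tools.calculate_umap` module listed in the file manifest, and it uses a random AnnData plus scanpy's `sc.pp.pca` purely as stand-ins for whatever normally writes `adata.obsm["X_pca"]` upstream.

```python
# Hypothetical usage sketch (assumptions noted above); not the package's documented workflow.
import anndata as ad
import numpy as np
import scanpy as sc

from smftools.tools.calculate_umap import calculate_umap

# Stand-in data: any AnnData with a PCA embedding in .obsm works the same way.
adata = ad.AnnData(np.random.rand(200, 50).astype(np.float32))
sc.pp.pca(adata, n_comps=15)  # writes adata.obsm["X_pca"]

# New signature: embed from an existing .obsm representation rather than a layer.
adata = calculate_umap(adata, obsm="X_pca", random_state=0, output_suffix="pca")

print(adata.obsm["X_umap_pca"].shape)        # (n_obs, 2)
print("connectivities_X_pca" in adata.obsp)  # UMAP fuzzy graph, per the diff above
```

If no `knn_distances_X_pca` entry exists in `adata.uns`, the function falls back to `n_neighbors=15` and logs a warning, as shown in the hunk above.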
@@ -320,12 +320,12 @@ def compute_positionwise_statistics(
 
      # samples / refs
      sseries = adata.obs[sample_col]
-     if not pd.api.types.is_categorical_dtype(sseries):
+     if not isinstance(sseries.dtype, pd.CategoricalDtype):
          sseries = sseries.astype("category")
      samples = list(sseries.cat.categories)
 
      rseries = adata.obs[ref_col]
-     if not pd.api.types.is_categorical_dtype(rseries):
+     if not isinstance(rseries.dtype, pd.CategoricalDtype):
          rseries = rseries.astype("category")
      references = list(rseries.cat.categories)
 
@@ -509,12 +509,12 @@ def plot_positionwise_matrices(
 
      # canonicalize sample/ref order
      sseries = adata.obs[sample_col]
-     if not pd.api.types.is_categorical_dtype(sseries):
+     if not isinstance(sseries.dtype, pd.CategoricalDtype):
          sseries = sseries.astype("category")
      samples = list(sseries.cat.categories)
 
      rseries = adata.obs[ref_col]
-     if not pd.api.types.is_categorical_dtype(rseries):
+     if not isinstance(rseries.dtype, pd.CategoricalDtype):
          rseries = rseries.astype("category")
      references = list(rseries.cat.categories)
 
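
The position_stats hunks above swap `pd.api.types.is_categorical_dtype`, which pandas deprecated in 2.1, for an `isinstance` check on the Series dtype. A standalone sketch of the equivalent behavior, plain pandas only and independent of smftools:

```python
# Illustration of the dtype check used in the hunks above (plain pandas).
import pandas as pd

sseries = pd.Series(["sampleA", "sampleB", "sampleA"])

# isinstance(dtype, pd.CategoricalDtype) gives the same answer as the deprecated
# pd.api.types.is_categorical_dtype(series) without emitting a FutureWarning.
if not isinstance(sseries.dtype, pd.CategoricalDtype):
    sseries = sseries.astype("category")

print(list(sseries.cat.categories))  # ['sampleA', 'sampleB']
```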