smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/flag_duplicate_reads.py CHANGED
@@ -1,51 +1,64 @@
+ from __future__ import annotations
+
  # duplicate_detection_with_hier_and_plots.py
  import copy
  import math
  import os
  import warnings
  from collections import defaultdict
- from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+ from importlib.util import find_spec
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union

- import anndata as ad
- import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
- import torch
+ from scipy.cluster import hierarchy as sch
+ from scipy.spatial.distance import pdist, squareform
+ from scipy.stats import gaussian_kde

  from smftools.logging_utils import get_logger
+ from smftools.optional_imports import require

  from ..readwrite import make_dirs

  logger = get_logger(__name__)

- # optional imports for clustering / PCA / KDE
- try:
-     from scipy.cluster import hierarchy as sch
-     from scipy.spatial.distance import pdist, squareform
-
-     SCIPY_AVAILABLE = True
- except Exception:
-     sch = None
-     pdist = None
-     squareform = None
-     SCIPY_AVAILABLE = False
+ plt = require("matplotlib.pyplot", extra="plotting", purpose="duplicate read plots")
+ torch = require("torch", extra="torch", purpose="duplicate read detection")

- try:
-     from sklearn.cluster import DBSCAN, KMeans
-     from sklearn.decomposition import PCA
-     from sklearn.metrics import silhouette_score
-     from sklearn.mixture import GaussianMixture
+ if TYPE_CHECKING:
+     import anndata as ad

-     SKLEARN_AVAILABLE = True
- except Exception:
-     PCA = None
-     KMeans = DBSCAN = GaussianMixture = silhouette_score = None
-     SKLEARN_AVAILABLE = False
+ SCIPY_AVAILABLE = True
+ SKLEARN_AVAILABLE = find_spec("sklearn") is not None

- try:
-     from scipy.stats import gaussian_kde
- except Exception:
-     gaussian_kde = None
+ PCA = None
+ KMeans = DBSCAN = GaussianMixture = silhouette_score = None
+ if SKLEARN_AVAILABLE:
+     sklearn_cluster = require(
+         "sklearn.cluster",
+         extra="ml-base",
+         purpose="duplicate read clustering",
+     )
+     sklearn_decomp = require(
+         "sklearn.decomposition",
+         extra="ml-base",
+         purpose="duplicate read PCA",
+     )
+     sklearn_metrics = require(
+         "sklearn.metrics",
+         extra="ml-base",
+         purpose="duplicate read clustering diagnostics",
+     )
+     sklearn_mixture = require(
+         "sklearn.mixture",
+         extra="ml-base",
+         purpose="duplicate read clustering",
+     )
+     DBSCAN = sklearn_cluster.DBSCAN
+     KMeans = sklearn_cluster.KMeans
+     PCA = sklearn_decomp.PCA
+     silhouette_score = sklearn_metrics.silhouette_score
+     GaussianMixture = sklearn_mixture.GaussianMixture


  def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer: str = "orig") -> dict:
@@ -153,24 +166,6 @@ def flag_duplicate_reads(
      import numpy as np
      import pandas as pd

-     # optional imports already guarded at module import time, but re-check
-     try:
-         from scipy.cluster import hierarchy as sch
-         from scipy.spatial.distance import pdist
-
-         SCIPY_AVAILABLE = True
-     except Exception:
-         sch = None
-         pdist = None
-         SCIPY_AVAILABLE = False
-     try:
-         from sklearn.decomposition import PCA
-
-         SKLEARN_AVAILABLE = True
-     except Exception:
-         PCA = None
-         SKLEARN_AVAILABLE = False
-
      # -------- helper: demux-aware keeper selection --------
      def _choose_keeper_with_demux_preference(
          members_idx: List[int],
@@ -1577,13 +1572,6 @@ def _run_clustering(
      Run clustering on 2D points (x,y). Returns labels (len = npoints) and diagnostics dict.
      Labels follow sklearn conventions (noise -> -1 for DBSCAN/HDBSCAN).
      """
-     try:
-         from sklearn.cluster import DBSCAN, KMeans
-         from sklearn.metrics import silhouette_score
-         from sklearn.mixture import GaussianMixture
-     except Exception:
-         KMeans = DBSCAN = GaussianMixture = silhouette_score = None
-
      pts = np.column_stack([x, y])
      diagnostics: Dict[str, Any] = {"method": method, "n_input": len(x)}
      if len(x) < min_points:
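
The hunks above replace per-file try/except import guards with the new `smftools.optional_imports.require` helper (`smftools/optional_imports.py`, +31 lines in this release). Only its call sites appear in the diff; a minimal sketch of a compatible helper, assuming it imports the requested module and raises an actionable error naming the pip extra when the dependency is missing:

```python
# Hypothetical sketch matching the require(...) call sites above; the actual
# smftools.optional_imports.require may differ in detail.
import importlib
from types import ModuleType


def require(module_name: str, *, extra: str, purpose: str) -> ModuleType:
    """Import ``module_name`` or fail with a message naming the pip extra."""
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"'{module_name}' is required for {purpose}. "
            f"Install it with: pip install 'smftools[{extra}]'"
        ) from exc
```
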
smftools/preprocessing/make_dirs.py CHANGED
@@ -1,5 +1,6 @@
- ## make_dirs
+ from __future__ import annotations

+ ## make_dirs
  from smftools.logging_utils import get_logger

  logger = get_logger(__name__)
smftools/preprocessing/min_non_diagonal.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  ## min_non_diagonal

smftools/preprocessing/recipes.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # recipes

smftools/readwrite.py CHANGED
@@ -431,6 +431,8 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
          "layers_skipped": [],
          "obsm_converted": [],
          "obsm_skipped": [],
+         "varm_converted": [],
+         "varm_skipped": [],
          "X_replaced_or_converted": None,
          "errors": [],
      }
@@ -605,10 +607,16 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No

      def _sanitize_layers_obsm(src_dict, which: str):
          """
-         Ensure arrays in layers/obsm are numeric and non-object dtype.
+         Ensure arrays in layers/obsm/varm are numeric and non-object dtype.
          Returns a cleaned dict suitable to pass into AnnData(...)
          If an entry is not convertible, it is backed up & skipped.
          """
+         report_map = {
+             "layers": ("layers_converted", "layers_skipped"),
+             "obsm": ("obsm_converted", "obsm_skipped"),
+             "varm": ("varm_converted", "varm_skipped"),
+         }
+         converted_key, skipped_key = report_map[which]
          cleaned = {}
          for k, v in src_dict.items():
              try:
@@ -618,9 +626,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                      arr_f = arr.astype(float)
                      cleaned[k] = arr_f
                      report_key = f"{which}.{k}"
-                     report["layers_converted"].append(
-                         report_key
-                     ) if which == "layers" else report["obsm_converted"].append(report_key)
+                     report[converted_key].append(report_key)
                      if verbose:
                          print(f" {which}.{k} object array coerced to float.")
                  except Exception:
@@ -628,18 +634,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                          arr_i = arr.astype(int)
                          cleaned[k] = arr_i
                          report_key = f"{which}.{k}"
-                         report["layers_converted"].append(
-                             report_key
-                         ) if which == "layers" else report["obsm_converted"].append(report_key)
+                         report[converted_key].append(report_key)
                          if verbose:
                              print(f" {which}.{k} object array coerced to int.")
                      except Exception:
                          if backup:
                              _backup(v, f"{which}_{k}_backup")
-                         if which == "layers":
-                             report["layers_skipped"].append(k)
-                         else:
-                             report["obsm_skipped"].append(k)
+                         report[skipped_key].append(k)
                          if verbose:
                              print(
                                  f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
@@ -650,10 +651,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
          except Exception as e:
              if backup:
                  _backup(v, f"{which}_{k}_backup")
-             if which == "layers":
-                 report["layers_skipped"].append(k)
-             else:
-                 report["obsm_skipped"].append(k)
+             report[skipped_key].append(k)
              msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
              report["errors"].append(msg)
              if verbose:
@@ -693,6 +691,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
      # ---------- sanitize layers and obsm ----------
      layers_src = getattr(adata, "layers", {})
      obsm_src = getattr(adata, "obsm", {})
+     varm_src = getattr(adata, "varm", {})

      try:
          layers_clean = _sanitize_layers_obsm(layers_src, "layers")
@@ -712,6 +711,15 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
              print(msg)
          obsm_clean = {}

+     try:
+         varm_clean = _sanitize_layers_obsm(varm_src, "varm")
+     except Exception as e:
+         msg = f"Failed to sanitize varm: {e}"
+         report["errors"].append(msg)
+         if verbose:
+             print(msg)
+         varm_clean = {}
+
      # ---------- handle X ----------
      X_to_use = adata.X
      try:
@@ -747,7 +755,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
          layers=layers_clean,
          uns=uns_clean,
          obsm=obsm_clean,
-         varm=getattr(adata, "varm", None),
+         varm=varm_clean,
      )

      # preserve names (as strings)
@@ -872,6 +880,16 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
              }
          )

+     # varm
+     for k, v in adata_copy.varm.items():
+         meta_rows.append(
+             {
+                 "kind": "varm",
+                 "name": k,
+                 "dtype": str(np.asarray(v).dtype),
+             }
+         )
+
      # uns
      for k, v in adata_copy.uns.items():
          meta_rows.append(
@@ -977,6 +995,7 @@ def safe_read_h5ad(
          "parsed_uns_json_keys": [],
          "restored_layers": [],
          "restored_obsm": [],
+         "restored_varm": [],
          "recategorized_obs": [],
          "recategorized_var": [],
          "missing_backups": [],
@@ -1215,7 +1234,7 @@ def safe_read_h5ad(
                  print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")

      # 5) Restore layers and obsm from backups if present
-     # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+     # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl, varm_<name>_backup.pkl
      if os.path.isdir(backup_dir):
          for fname in os.listdir(backup_dir):
              if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
@@ -1248,6 +1267,21 @@ def safe_read_h5ad(
                          f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
                      )

+             if fname.startswith("varm_") and fname.endswith("_backup.pkl"):
+                 varm_name = fname[len("varm_") : -len("_backup.pkl")]
+                 full = os.path.join(backup_dir, fname)
+                 val = _load_pickle_if_exists(full)
+                 if val is not None:
+                     try:
+                         adata.varm[varm_name] = np.asarray(val)
+                         report["restored_varm"].append((varm_name, full))
+                         if verbose:
+                             print(f"[safe_read_h5ad] restored varm['{varm_name}'] from {full}")
+                     except Exception as e:
+                         report["errors"].append(
+                             f"Failed to restore varm['{varm_name}'] from {full}: {e}"
+                         )
+
      # 6) If restore_backups True but some expected backups missing, note them
      if restore_backups and os.path.isdir(backup_dir):
          # detect common expected names from obs/var/uns/layers in adata
@@ -1297,6 +1331,8 @@ def safe_read_h5ad(
          print("Restored layers:", report["restored_layers"])
      if report["restored_obsm"]:
          print("Restored obsm:", report["restored_obsm"])
+     if report["restored_varm"]:
+         print("Restored varm:", report["restored_varm"])
      if report["recategorized_obs"] or report["recategorized_var"]:
          print(
              "Recategorized columns (obs/var):",
smftools/schema/anndata_schema_v1.yaml CHANGED
@@ -60,6 +60,20 @@ stages:
        notes: "Mapping quality score."
        requires: []
        optional_inputs: []
+     reference_start:
+       dtype: "float"
+       created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"
+       modified_by: []
+       notes: "0-based reference start position for the alignment."
+       requires: []
+       optional_inputs: []
+     reference_end:
+       dtype: "float"
+       created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"
+       modified_by: []
+       notes: "0-based reference end position (exclusive) for the alignment."
+       requires: []
+       optional_inputs: []
      read_length_to_reference_length_ratio:
        dtype: "float"
        created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"
@@ -179,7 +193,7 @@ stages:
    obs:
      leiden:
        dtype: "category"
-       created_by: "smftools.tools.calculate_umap"
+       created_by: "smftools.tools.calculate_leiden"
        modified_by: []
        notes: "Leiden cluster assignments."
        requires: [["obsm.X_umap"]]
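
The new `reference_start`/`reference_end` obs columns use 0-based, end-exclusive coordinates per the schema notes, so the aligned reference span is simply their difference. A small illustration with hypothetical values:

```python
import pandas as pd

# Hypothetical obs slice; reference_end is exclusive per the schema notes.
obs = pd.DataFrame({
    "reference_start": [100.0, 250.0],
    "reference_end": [180.0, 400.0],
})
obs["aligned_reference_length"] = obs["reference_end"] - obs["reference_start"]
print(obs)  # spans of 80 and 150 reference bases
```
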
smftools/tools/__init__.py CHANGED
@@ -1,19 +1,31 @@
- from .calculate_umap import calculate_umap
- from .cluster_adata_on_methylation import cluster_adata_on_methylation
- from .general_tools import combine_layers, create_nan_mask_from_X, create_nan_or_non_gpc_mask
- from .position_stats import calculate_relative_risk_on_activity, compute_positionwise_statistics
- from .read_stats import calculate_row_entropy
- from .spatial_autocorrelation import *
- from .subset_adata import subset_adata
+ from __future__ import annotations

- __all__ = [
-     "compute_positionwise_statistics",
-     "calculate_row_entropy",
-     "calculate_umap",
-     "calculate_relative_risk_on_activity",
-     "cluster_adata_on_methylation",
-     "create_nan_mask_from_X",
-     "create_nan_or_non_gpc_mask",
-     "combine_layers",
-     "subset_adata",
- ]
+ from importlib import import_module
+
+ _LAZY_ATTRS = {
+     "calculate_leiden": "smftools.tools.calculate_leiden",
+     "calculate_nmf": "smftools.tools.calculate_nmf",
+     "calculate_sequence_cp_decomposition": "smftools.tools.tensor_factorization",
+     "calculate_umap": "smftools.tools.calculate_umap",
+     "cluster_adata_on_methylation": "smftools.tools.cluster_adata_on_methylation",
+     "combine_layers": "smftools.tools.general_tools",
+     "create_nan_mask_from_X": "smftools.tools.general_tools",
+     "create_nan_or_non_gpc_mask": "smftools.tools.general_tools",
+     "calculate_relative_risk_on_activity": "smftools.tools.position_stats",
+     "compute_positionwise_statistics": "smftools.tools.position_stats",
+     "calculate_row_entropy": "smftools.tools.read_stats",
+     "rolling_window_nn_distance": "smftools.tools.rolling_nn_distance",
+     "subset_adata": "smftools.tools.subset_adata",
+ }
+
+
+ def __getattr__(name: str):
+     if name in _LAZY_ATTRS:
+         module = import_module(_LAZY_ATTRS[name])
+         attr = getattr(module, name)
+         globals()[name] = attr
+         return attr
+     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
+ __all__ = list(_LAZY_ATTRS.keys())
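
With this rewrite, `import smftools.tools` no longer eagerly imports every submodule: a module-level `__getattr__` (PEP 562) resolves each name on first access and caches it in the package globals. For example (assuming the optional dependencies behind the accessed tool are installed):

```python
import smftools.tools as tools

# The submodule behind calculate_umap is imported only at this first access;
# __getattr__ then caches the symbol so later lookups bypass it entirely.
umap_fn = tools.calculate_umap
assert "calculate_umap" in vars(tools)

# tools.no_such_tool would raise AttributeError, as for a regular module.
```
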
smftools/tools/archived/apply_hmm.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import numpy as np
  import pandas as pd
  import torch
smftools/tools/archived/classifiers.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  ## Train CNN, RNN, Random Forest models on double barcoded, low contamination datasets
  import torch
  import torch.nn as nn
smftools/tools/archived/classify_methylated_features.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # classify_methylated_features

  def classify_methylated_features(read, model, coordinates, classification_mapping={}):
smftools/tools/archived/classify_non_methylated_features.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # classify_non_methylated_features

  def classify_non_methylated_features(read, model, coordinates, classification_mapping={}):
smftools/tools/archived/subset_adata_v1.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # subset_adata

  def subset_adata(adata, obs_columns):
smftools/tools/archived/subset_adata_v2.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  # subset_adata

  def subset_adata(adata, columns, cat_type='obs'):
smftools/tools/calculate_leiden.py ADDED
@@ -0,0 +1,57 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import pandas as pd
+
+ from smftools.logging_utils import get_logger
+ from smftools.optional_imports import require
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+ logger = get_logger(__name__)
+
+
+ def calculate_leiden(
+     adata: "ad.AnnData",
+     *,
+     resolution: float = 0.1,
+     key_added: str = "leiden",
+     connectivities_key: str = "connectivities",
+ ) -> "ad.AnnData":
+     """Compute Leiden clusters from a connectivity graph.
+
+     Args:
+         adata: AnnData object with ``obsp[connectivities_key]`` set.
+         resolution: Resolution parameter for Leiden clustering.
+         key_added: Column name to store cluster assignments in ``adata.obs``.
+         connectivities_key: Key in ``adata.obsp`` containing a sparse adjacency matrix.
+
+     Returns:
+         Updated AnnData object with Leiden labels in ``adata.obs``.
+     """
+     if connectivities_key not in adata.obsp:
+         raise KeyError(f"Missing connectivities '{connectivities_key}' in adata.obsp.")
+
+     igraph = require("igraph", extra="cluster", purpose="Leiden clustering")
+     leidenalg = require("leidenalg", extra="cluster", purpose="Leiden clustering")
+
+     connectivities = adata.obsp[connectivities_key]
+     coo = connectivities.tocoo()
+     edges = list(zip(coo.row.tolist(), coo.col.tolist()))
+     graph = igraph.Graph(n=connectivities.shape[0], edges=edges, directed=False)
+     graph.es["weight"] = coo.data.tolist()
+
+     partition = leidenalg.find_partition(
+         graph,
+         leidenalg.RBConfigurationVertexPartition,
+         weights=graph.es["weight"],
+         resolution_parameter=resolution,
+     )
+
+     labels = np.array(partition.membership, dtype=str)
+     adata.obs[key_added] = pd.Categorical(labels)
+     logger.info("Stored Leiden clusters in adata.obs['%s'].", key_added)
+     return adata
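
A usage sketch for the new function: it only needs a sparse adjacency in `adata.obsp`, such as the neighbor graph computed upstream (e.g., by `calculate_umap`); here a plain kNN graph from scikit-learn stands in for the smftools pipeline:

```python
import numpy as np
import anndata as ad
from sklearn.neighbors import kneighbors_graph  # requires scikit-learn
from smftools.tools import calculate_leiden

adata = ad.AnnData(X=np.random.rand(100, 20).astype(np.float32))
# Any sparse (n_obs x n_obs) adjacency works as "connectivities".
adata.obsp["connectivities"] = kneighbors_graph(adata.X, n_neighbors=10, mode="connectivity")
adata = calculate_leiden(adata, resolution=0.1, key_added="leiden")
print(adata.obs["leiden"].value_counts())
```
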
smftools/tools/calculate_nmf.py ADDED
@@ -0,0 +1,119 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Sequence
+
+ import numpy as np
+
+ from smftools.logging_utils import get_logger
+ from smftools.optional_imports import require
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+ logger = get_logger(__name__)
+
+
+ def calculate_nmf(
+     adata: "ad.AnnData",
+     layer: str | None = "nan_half",
+     var_filters: Sequence[str] | None = None,
+     n_components: int = 2,
+     max_iter: int = 200,
+     random_state: int = 0,
+     overwrite: bool = True,
+     embedding_key: str = "X_nmf",
+     components_key: str = "H_nmf",
+     uns_key: str = "nmf",
+ ) -> "ad.AnnData":
+     """Compute a low-dimensional NMF embedding.
+
+     Args:
+         adata: AnnData object to update.
+         layer: Layer name to use for NMF (``None`` uses ``adata.X``).
+         var_filters: Optional list of var masks to subset features.
+         n_components: Number of NMF components to compute.
+         max_iter: Maximum number of NMF iterations.
+         random_state: Random seed for the NMF initializer.
+         overwrite: Whether to recompute if the embedding already exists.
+         embedding_key: Key for the embedding in ``adata.obsm``.
+         components_key: Key for the components matrix in ``adata.varm``.
+         uns_key: Key for metadata stored in ``adata.uns``.
+
+     Returns:
+         anndata.AnnData: Updated AnnData object.
+     """
+     from scipy.sparse import issparse
+
+     require("sklearn", extra="ml-base", purpose="NMF calculation")
+     from sklearn.decomposition import NMF
+
+     has_embedding = embedding_key in adata.obsm
+     has_components = components_key in adata.varm
+     if has_embedding and has_components and not overwrite:
+         logger.info("NMF embedding and components already present; skipping recomputation.")
+         return adata
+     if has_embedding and not has_components and not overwrite:
+         logger.info("NMF embedding present without components; recomputing to store components.")
+
+     subset_mask = None
+     if var_filters:
+         subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
+         adata_subset = adata[:, subset_mask].copy()
+         logger.info(
+             "Subsetting adata: retained %s features based on filters %s",
+             adata_subset.shape[1],
+             var_filters,
+         )
+     else:
+         adata_subset = adata.copy()
+         logger.info("No var filters provided. Using all features.")
+
+     data = adata_subset.layers[layer] if layer else adata_subset.X
+     if issparse(data):
+         data = data.copy()
+         if data.data.size and np.isnan(data.data).any():
+             logger.warning("NaNs detected in sparse data, filling with 0.5 before NMF.")
+             data.data = np.nan_to_num(data.data, nan=0.5)
+         if data.data.size and (data.data < 0).any():
+             logger.warning("Negative values detected in sparse data, clipping to 0 for NMF.")
+             data.data[data.data < 0] = 0
+     else:
+         if np.isnan(data).any():
+             logger.warning("NaNs detected, filling with 0.5 before NMF.")
+             data = np.nan_to_num(data, nan=0.5)
+         if (data < 0).any():
+             logger.warning("Negative values detected, clipping to 0 for NMF.")
+             data = np.clip(data, a_min=0, a_max=None)
+
+     model = NMF(
+         n_components=n_components,
+         init="nndsvda",
+         max_iter=max_iter,
+         random_state=random_state,
+     )
+     embedding = model.fit_transform(data)
+     components = model.components_.T
+
+     if subset_mask is not None:
+         components_matrix = np.zeros((adata.shape[1], components.shape[1]))
+         components_matrix[subset_mask, :] = components
+     else:
+         components_matrix = components
+
+     adata.obsm[embedding_key] = embedding
+     adata.varm[components_key] = components_matrix
+     adata.uns[uns_key] = {
+         "n_components": n_components,
+         "max_iter": max_iter,
+         "random_state": random_state,
+         "layer": layer,
+         "var_filters": list(var_filters) if var_filters else None,
+         "components_key": components_key,
+     }
+
+     logger.info(
+         "Stored: adata.obsm['%s'] and adata.varm['%s']",
+         embedding_key,
+         components_key,
+     )
+     return adata
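
A usage sketch for `calculate_nmf`, assuming a dense float layer as input: the per-read loadings land in `obsm["X_nmf"]`, the per-position components in `varm["H_nmf"]` (which the `safe_write_h5ad` varm support above now serializes), and the fit parameters in `uns["nmf"]`:

```python
import numpy as np
import anndata as ad
from smftools.tools import calculate_nmf

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((200, 50)).astype(np.float32))
adata.layers["nan_half"] = adata.X.copy()  # the default input layer name

adata = calculate_nmf(adata, layer="nan_half", n_components=2)
print(adata.obsm["X_nmf"].shape)  # (200, 2) read loadings
print(adata.varm["H_nmf"].shape)  # (50, 2) position components
print(adata.uns["nmf"])           # parameters recorded for the fit
```
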