smftools-0.3.0-py3-none-any.whl → smftools-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from pathlib import Path
4
5
  from typing import Optional, Tuple
5
6
 
6
7
  import anndata as ad
7
8
 
8
- from smftools.logging_utils import get_logger
9
- from smftools.optional_imports import require
9
+ from smftools.constants import LOGGING_DIR, SPATIAL_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
10
11
 
11
12
  logger = get_logger(__name__)
12
13
 
@@ -34,64 +35,61 @@ def spatial_adata(
34
35
  spatial_adata_path : Path | None
35
36
  Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
36
37
  """
37
- from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
38
- from .helpers import get_adata_paths
39
- from .load_adata import load_adata
40
- from .preprocess_adata import preprocess_adata
38
+ from ..readwrite import safe_read_h5ad
39
+ from .helpers import get_adata_paths, load_experiment_config
41
40
 
42
41
  # 1) Ensure config + basic paths via load_adata
43
- loaded_adata, loaded_path, cfg = load_adata(config_path)
42
+ cfg = load_experiment_config(config_path)
43
+
44
44
  paths = get_adata_paths(cfg)
45
45
 
46
- raw_path = paths.raw
47
46
  pp_path = paths.pp
48
47
  pp_dedup_path = paths.pp_dedup
49
48
  spatial_path = paths.spatial
49
+ chimeric_path = paths.chimeric
50
+ variant_path = paths.variant
50
51
  hmm_path = paths.hmm
52
+ latent_path = paths.latent
51
53
 
52
54
  # Stage-skipping logic for spatial
53
55
  if not getattr(cfg, "force_redo_spatial_analyses", False):
54
- # If HMM exists, it's the most processed stage — reuse it.
55
- if hmm_path.exists():
56
- logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
57
- return None, hmm_path
58
-
59
56
  # If spatial exists, we consider spatial analyses already done.
60
57
  if spatial_path.exists():
61
58
  logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
62
59
  return None, spatial_path
63
60
 
64
- # 2) Ensure preprocessing has been run
65
- # This will create pp/pp_dedup as needed or return them if they already exist.
66
- pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
67
- config_path
68
- )
69
-
70
61
  # Helper to load from disk, reusing loaded_adata if it matches
71
62
  def _load(path: Path):
72
- if loaded_adata is not None and loaded_path == path:
73
- return loaded_adata
74
63
  adata, _ = safe_read_h5ad(path)
75
64
  return adata
76
65
 
77
66
  # 3) Decide which AnnData to use as the *starting point* for spatial analyses
78
- # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
79
- if pp_dedup_adata is not None:
80
- start_adata = pp_dedup_adata
81
- source_path = pp_dedup_adata_path_ret
67
+ if hmm_path.exists():
68
+ start_adata = _load(hmm_path)
69
+ source_path = hmm_path
70
+ elif latent_path.exists():
71
+ start_adata = _load(latent_path)
72
+ source_path = latent_path
73
+ elif spatial_path.exists():
74
+ start_adata = _load(spatial_path)
75
+ source_path = spatial_path
76
+ elif chimeric_path.exists():
77
+ start_adata = _load(chimeric_path)
78
+ source_path = chimeric_path
79
+ elif variant_path.exists():
80
+ start_adata = _load(variant_path)
81
+ source_path = variant_path
82
+ elif pp_dedup_path.exists():
83
+ start_adata = _load(pp_dedup_path)
84
+ source_path = pp_dedup_path
85
+ elif pp_path.exists():
86
+ start_adata = _load(pp_path)
87
+ source_path = pp_path
82
88
  else:
83
- if pp_dedup_path.exists():
84
- start_adata = _load(pp_dedup_path)
85
- source_path = pp_dedup_path
86
- elif pp_path.exists():
87
- start_adata = _load(pp_path)
88
- source_path = pp_path
89
- elif raw_path.exists():
90
- start_adata = _load(raw_path)
91
- source_path = raw_path
92
- else:
93
- logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
94
- return None, None
89
+ logger.warning(
90
+ "No suitable AnnData found for spatial analyses (need at least preprocessed)."
91
+ )
92
+ return None, None
95
93
 
96
94
  # 4) Run the spatial core
97
95
  adata_spatial, spatial_path = spatial_adata_core(
@@ -99,15 +97,10 @@ def spatial_adata(
99
97
  cfg=cfg,
100
98
  spatial_adata_path=spatial_path,
101
99
  pp_adata_path=pp_path,
102
- pp_dup_rem_adata_path=pp_dedup_path,
103
- pp_adata_in_memory=pp_adata,
104
100
  source_adata_path=source_path,
105
101
  config_path=config_path,
106
102
  )
107
103
 
108
- # 5) Register spatial path in summary CSV
109
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
110
-
111
104
  return adata_spatial, spatial_path
112
105
 
113
106
 
@@ -116,8 +109,6 @@ def spatial_adata_core(
116
109
  cfg,
117
110
  spatial_adata_path: Path,
118
111
  pp_adata_path: Path,
119
- pp_dup_rem_adata_path: Path,
120
- pp_adata_in_memory: Optional[ad.AnnData] = None,
121
112
  source_adata_path: Optional[Path] = None,
122
113
  config_path: Optional[str] = None,
123
114
  ) -> Tuple[ad.AnnData, Path]:
@@ -129,8 +120,6 @@ def spatial_adata_core(
129
120
  - `cfg` is the ExperimentConfig.
130
121
  - `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
131
122
  from `get_adata_paths`.
132
- - `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
133
- the same run of `preprocess_adata`, to avoid re-reading from disk.
134
123
 
135
124
  Does:
136
125
  - Optional sample sheet load.
@@ -152,13 +141,12 @@ def spatial_adata_core(
152
141
  """
153
142
  import os
154
143
  import warnings
144
+ from datetime import datetime
155
145
  from pathlib import Path
156
146
 
157
147
  import numpy as np
158
148
  import pandas as pd
159
149
 
160
- sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
161
-
162
150
  from ..metadata import record_smftools_metadata
163
151
  from ..plotting import (
164
152
  combined_raw_clustermap,
@@ -171,7 +159,6 @@ def spatial_adata_core(
171
159
  reindex_references_adata,
172
160
  )
173
161
  from ..readwrite import make_dirs, safe_read_h5ad
174
- from ..tools import calculate_umap
175
162
  from ..tools.position_stats import (
176
163
  compute_positionwise_statistics,
177
164
  plot_positionwise_matrices,
@@ -187,8 +174,24 @@ def spatial_adata_core(
187
174
  # -----------------------------
188
175
  # General setup
189
176
  # -----------------------------
177
+ date_str = datetime.today().strftime("%y%m%d")
178
+ now = datetime.now()
179
+ time_str = now.strftime("%H%M%S")
180
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
181
+
190
182
  output_directory = Path(cfg.output_directory)
191
- make_dirs([output_directory])
183
+ spatial_directory = output_directory / SPATIAL_DIR
184
+ logging_directory = spatial_directory / LOGGING_DIR
185
+
186
+ make_dirs([output_directory, spatial_directory])
187
+
188
+ if cfg.emit_log_file:
189
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
190
+ make_dirs([logging_directory])
191
+ else:
192
+ log_file = None
193
+
194
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
192
195
 
193
196
  smf_modality = cfg.smf_modality
194
197
  if smf_modality == "conversion":
@@ -196,8 +199,6 @@ def spatial_adata_core(
196
199
  else:
197
200
  deaminase = True
198
201
 
199
- first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
200
-
201
202
  # -----------------------------
202
203
  # Optional sample sheet metadata
203
204
  # -----------------------------
@@ -231,17 +232,16 @@ def spatial_adata_core(
231
232
  else:
232
233
  reindex_suffix = None
233
234
 
234
- pp_dir = output_directory / "preprocessed"
235
235
  references = adata.obs[cfg.reference_column].cat.categories
236
236
 
237
237
  # ============================================================
238
- # 1) Clustermaps (non-direct modalities) on *preprocessed* data
238
+ # 1) Clustermaps (non-direct modalities) on preprocessed adata
239
239
  # ============================================================
240
240
  if smf_modality != "direct":
241
241
  preprocessed_version_available = pp_adata_path.exists()
242
242
 
243
243
  if preprocessed_version_available:
244
- pp_clustermap_dir = pp_dir / "06_clustermaps"
244
+ pp_clustermap_dir = spatial_directory / "01_clustermaps"
245
245
 
246
246
  if pp_clustermap_dir.is_dir() and not getattr(
247
247
  cfg, "force_redo_spatial_analyses", False
@@ -250,12 +250,9 @@ def spatial_adata_core(
250
250
  f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
251
251
  )
252
252
  else:
253
- make_dirs([pp_dir, pp_clustermap_dir])
253
+ make_dirs([spatial_directory, pp_clustermap_dir])
254
254
 
255
- if first_pp_run and (pp_adata_in_memory is not None):
256
- pp_adata = pp_adata_in_memory
257
- else:
258
- pp_adata, _ = safe_read_h5ad(pp_adata_path)
255
+ pp_adata, _ = safe_read_h5ad(pp_adata_path)
259
256
 
260
257
  # -----------------------------
261
258
  # Optional sample sheet metadata
@@ -303,8 +300,8 @@ def spatial_adata_core(
303
300
  min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
304
301
  0
305
302
  ],
306
- min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
307
- demux_types=("double", "already"),
303
+ min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
304
+ demux_types=cfg.clustermap_demux_types_to_plot,
308
305
  bins=None,
309
306
  sample_mapping=None,
310
307
  save_path=pp_clustermap_dir,
@@ -314,19 +311,18 @@ def spatial_adata_core(
314
311
  )
315
312
 
316
313
  # ============================================================
317
- # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
314
+ # 2) Clustermaps on deduplicated preprocessed AnnDatas
318
315
  # ============================================================
319
- pp_dir_dedup = pp_dir / "deduplicated"
320
- pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
321
- pp_umap_dir = pp_dir_dedup / "07_umaps"
316
+ spatial_dir_dedup = spatial_directory / "deduplicated"
317
+ clustermap_dir_dedup = spatial_dir_dedup / "01_clustermaps"
322
318
 
323
319
  # Clustermaps on deduplicated adata
324
- if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
320
+ if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
325
321
  logger.debug(
326
- f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
322
+ f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
327
323
  )
328
324
  else:
329
- make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
325
+ make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
330
326
  combined_raw_clustermap(
331
327
  adata,
332
328
  sample_col=cfg.sample_name_col_for_plotting,
@@ -346,53 +342,19 @@ def spatial_adata_core(
346
342
  0
347
343
  ],
348
344
  min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
349
- demux_types=("double", "already"),
345
+ demux_types=cfg.clustermap_demux_types_to_plot,
350
346
  bins=None,
351
347
  sample_mapping=None,
352
- save_path=pp_clustermap_dir_dedup,
348
+ save_path=clustermap_dir_dedup,
353
349
  sort_by=cfg.spatial_clustermap_sortby,
354
350
  deaminase=deaminase,
355
351
  index_col_suffix=reindex_suffix,
356
352
  )
357
353
 
358
- # UMAP / Leiden
359
- if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
360
- logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
361
- else:
362
- make_dirs([pp_umap_dir])
363
-
364
- var_filters = []
365
- if smf_modality == "direct":
366
- for ref in references:
367
- for base in cfg.mod_target_bases:
368
- var_filters.append(f"{ref}_{base}_site")
369
- elif deaminase:
370
- for ref in references:
371
- var_filters.append(f"{ref}_C_site")
372
- else:
373
- for ref in references:
374
- for base in cfg.mod_target_bases:
375
- var_filters.append(f"{ref}_{base}_site")
376
-
377
- adata = calculate_umap(
378
- adata,
379
- layer=cfg.layer_for_umap_plotting,
380
- var_filters=var_filters,
381
- n_pcs=10,
382
- knn_neighbors=15,
383
- )
384
-
385
- sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
386
-
387
- sc.settings.figdir = pp_umap_dir
388
- umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
389
- umap_layers += cfg.umap_layers_to_plot
390
- sc.pl.umap(adata, color=umap_layers, show=False, save=True)
391
-
392
354
  # ============================================================
393
355
  # 3) Spatial autocorrelation + rolling metrics
394
356
  # ============================================================
395
- pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
357
+ pp_autocorr_dir = spatial_dir_dedup / "02_autocorrelations"
396
358
 
397
359
  if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
398
360
  logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
@@ -735,10 +697,10 @@ def spatial_adata_core(
735
697
  # ============================================================
736
698
  # 4) Pearson / correlation matrices
737
699
  # ============================================================
738
- pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
700
+ corr_dir = spatial_dir_dedup / "03_correlation_matrices"
739
701
 
740
- if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
741
- logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
702
+ if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
703
+ logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
742
704
  else:
743
705
  compute_positionwise_statistics(
744
706
  adata,
@@ -763,15 +725,15 @@ def spatial_adata_core(
763
725
  cmaps=cfg.correlation_matrix_cmaps,
764
726
  vmin=None,
765
727
  vmax=None,
766
- output_dir=pp_corr_dir,
728
+ output_dir=corr_dir,
767
729
  output_key="positionwise_result",
768
730
  )
769
731
 
770
732
  # ============================================================
771
- # 5) Save spatial AnnData
733
+ # 4) Save spatial AnnData
772
734
  # ============================================================
773
735
  if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
774
- logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
736
+ logger.info("Saving spatial analyzed AnnData.")
775
737
  record_smftools_metadata(
776
738
  adata,
777
739
  step_name="spatial",