smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (164)
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
1
4
  from pathlib import Path
2
5
  from typing import Optional, Tuple
3
6
 
4
7
  import anndata as ad
5
8
 
6
- from smftools.logging_utils import get_logger
9
+ from smftools.constants import LOGGING_DIR, SEQUENCE_INTEGER_ENCODING, SPATIAL_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
11
+ from smftools.optional_imports import require
7
12
 
8
13
  logger = get_logger(__name__)
9
14
 
@@ -32,15 +37,13 @@ def spatial_adata(
32
37
  Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
33
38
  """
34
39
  from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
35
- from .helpers import get_adata_paths
36
- from .load_adata import load_adata
37
- from .preprocess_adata import preprocess_adata
40
+ from .helpers import get_adata_paths, load_experiment_config
38
41
 
39
42
  # 1) Load the experiment config and resolve the canonical AnnData paths
40
- loaded_adata, loaded_path, cfg = load_adata(config_path)
43
+ cfg = load_experiment_config(config_path)
44
+
41
45
  paths = get_adata_paths(cfg)
42
46
 
43
- raw_path = paths.raw
44
47
  pp_path = paths.pp
45
48
  pp_dedup_path = paths.pp_dedup
46
49
  spatial_path = paths.spatial
@@ -48,47 +51,34 @@ def spatial_adata(
48
51
 
49
52
  # Stage-skipping logic for spatial
50
53
  if not getattr(cfg, "force_redo_spatial_analyses", False):
51
- # If HMM exists, it's the most processed stage — reuse it.
52
- if hmm_path.exists():
53
- logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
54
- return None, hmm_path
55
-
56
54
  # If spatial exists, we consider spatial analyses already done.
57
55
  if spatial_path.exists():
58
56
  logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
59
57
  return None, spatial_path
60
58
 
61
- # 2) Ensure preprocessing has been run
62
- # This will create pp/pp_dedup as needed or return them if they already exist.
63
- pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
64
- config_path
65
- )
66
-
67
59
  # Helper to load an AnnData from disk
68
60
  def _load(path: Path):
69
- if loaded_adata is not None and loaded_path == path:
70
- return loaded_adata
71
61
  adata, _ = safe_read_h5ad(path)
72
62
  return adata
73
63
 
74
64
  # 3) Decide which AnnData to use as the *starting point* for spatial analyses
75
- # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
76
- if pp_dedup_adata is not None:
77
- start_adata = pp_dedup_adata
78
- source_path = pp_dedup_adata_path_ret
65
+ if hmm_path.exists():
66
+ start_adata = _load(hmm_path)
67
+ source_path = hmm_path
68
+ elif spatial_path.exists():
69
+ start_adata = _load(spatial_path)
70
+ source_path = spatial_path
71
+ elif pp_dedup_path.exists():
72
+ start_adata = _load(pp_dedup_path)
73
+ source_path = pp_dedup_path
74
+ elif pp_path.exists():
75
+ start_adata = _load(pp_path)
76
+ source_path = pp_path
79
77
  else:
80
- if pp_dedup_path.exists():
81
- start_adata = _load(pp_dedup_path)
82
- source_path = pp_dedup_path
83
- elif pp_path.exists():
84
- start_adata = _load(pp_path)
85
- source_path = pp_path
86
- elif raw_path.exists():
87
- start_adata = _load(raw_path)
88
- source_path = raw_path
89
- else:
90
- logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
91
- return None, None
78
+ logger.warning(
79
+ "No suitable AnnData found for spatial analyses (need at least preprocessed)."
80
+ )
81
+ return None, None
92
82
 
93
83
  # 4) Run the spatial core
94
84
  adata_spatial, spatial_path = spatial_adata_core(
@@ -96,15 +86,10 @@ def spatial_adata(
96
86
  cfg=cfg,
97
87
  spatial_adata_path=spatial_path,
98
88
  pp_adata_path=pp_path,
99
- pp_dup_rem_adata_path=pp_dedup_path,
100
- pp_adata_in_memory=pp_adata,
101
89
  source_adata_path=source_path,
102
90
  config_path=config_path,
103
91
  )
104
92
 
105
- # 5) Register spatial path in summary CSV
106
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
107
-
108
93
  return adata_spatial, spatial_path
109
94
 
110
95
 
@@ -113,8 +98,6 @@ def spatial_adata_core(
113
98
  cfg,
114
99
  spatial_adata_path: Path,
115
100
  pp_adata_path: Path,
116
- pp_dup_rem_adata_path: Path,
117
- pp_adata_in_memory: Optional[ad.AnnData] = None,
118
101
  source_adata_path: Optional[Path] = None,
119
102
  config_path: Optional[str] = None,
120
103
  ) -> Tuple[ad.AnnData, Path]:
@@ -126,8 +109,6 @@ def spatial_adata_core(
126
109
  - `cfg` is the ExperimentConfig.
127
110
  - `spatial_adata_path` and `pp_adata_path` are canonical paths
128
111
  from `get_adata_paths`.
129
- - `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
130
- the same run of `preprocess_adata`, to avoid re-reading from disk.
131
112
 
132
113
  Does:
133
114
  - Optional sample sheet load.
@@ -149,16 +130,17 @@ def spatial_adata_core(
149
130
  """
150
131
  import os
151
132
  import warnings
133
+ from datetime import datetime
152
134
  from pathlib import Path
153
135
 
154
136
  import numpy as np
155
137
  import pandas as pd
156
- import scanpy as sc
157
138
 
158
139
  from ..metadata import record_smftools_metadata
159
140
  from ..plotting import (
160
141
  combined_raw_clustermap,
161
142
  plot_rolling_grid,
143
+ plot_rolling_nn_and_layer,
162
144
  plot_spatial_autocorr_grid,
163
145
  )
164
146
  from ..preprocessing import (
@@ -167,11 +149,12 @@ def spatial_adata_core(
167
149
  reindex_references_adata,
168
150
  )
169
151
  from ..readwrite import make_dirs, safe_read_h5ad
170
- from ..tools import calculate_umap
152
+ from ..tools import rolling_window_nn_distance
171
153
  from ..tools.position_stats import (
172
154
  compute_positionwise_statistics,
173
155
  plot_positionwise_matrices,
174
156
  )
157
+ from ..tools.rolling_nn_distance import assign_rolling_nn_results
175
158
  from ..tools.spatial_autocorrelation import (
176
159
  analyze_autocorr_matrix,
177
160
  binary_autocorrelation_with_spacing,
@@ -183,8 +166,24 @@ def spatial_adata_core(
183
166
  # -----------------------------
184
167
  # General setup
185
168
  # -----------------------------
169
+ date_str = datetime.today().strftime("%y%m%d")
170
+ now = datetime.now()
171
+ time_str = now.strftime("%H%M%S")
172
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
173
+
186
174
  output_directory = Path(cfg.output_directory)
187
- make_dirs([output_directory])
175
+ spatial_directory = output_directory / SPATIAL_DIR
176
+ logging_directory = spatial_directory / LOGGING_DIR
177
+
178
+ make_dirs([output_directory, spatial_directory])
179
+
180
+ if cfg.emit_log_file:
181
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
182
+ make_dirs([logging_directory])
183
+ else:
184
+ log_file = None
185
+
186
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
188
187
 
189
188
  smf_modality = cfg.smf_modality
190
189
  if smf_modality == "conversion":
@@ -192,8 +191,6 @@ def spatial_adata_core(
192
191
  else:
193
192
  deaminase = True
194
193
 
195
- first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
196
-
197
194
  # -----------------------------
198
195
  # Optional sample sheet metadata
199
196
  # -----------------------------
@@ -227,7 +224,6 @@ def spatial_adata_core(
227
224
  else:
228
225
  reindex_suffix = None
229
226
 
230
- pp_dir = output_directory / "preprocessed"
231
227
  references = adata.obs[cfg.reference_column].cat.categories
232
228
 
233
229
  # ============================================================
@@ -237,7 +233,7 @@ def spatial_adata_core(
237
233
  preprocessed_version_available = pp_adata_path.exists()
238
234
 
239
235
  if preprocessed_version_available:
240
- pp_clustermap_dir = pp_dir / "06_clustermaps"
236
+ pp_clustermap_dir = spatial_directory / "06_clustermaps"
241
237
 
242
238
  if pp_clustermap_dir.is_dir() and not getattr(
243
239
  cfg, "force_redo_spatial_analyses", False
@@ -246,12 +242,9 @@ def spatial_adata_core(
246
242
  f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
247
243
  )
248
244
  else:
249
- make_dirs([pp_dir, pp_clustermap_dir])
245
+ make_dirs([spatial_directory, pp_clustermap_dir])
250
246
 
251
- if first_pp_run and (pp_adata_in_memory is not None):
252
- pp_adata = pp_adata_in_memory
253
- else:
254
- pp_adata, _ = safe_read_h5ad(pp_adata_path)
247
+ pp_adata, _ = safe_read_h5ad(pp_adata_path)
255
248
 
256
249
  # -----------------------------
257
250
  # Optional sample sheet metadata
@@ -300,7 +293,7 @@ def spatial_adata_core(
300
293
  0
301
294
  ],
302
295
  min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
303
- demux_types=("double", "already"),
296
+ demux_types=cfg.clustermap_demux_types_to_plot,
304
297
  bins=None,
305
298
  sample_mapping=None,
306
299
  save_path=pp_clustermap_dir,
@@ -310,19 +303,18 @@ def spatial_adata_core(
310
303
  )
311
304
 
312
305
  # ============================================================
313
- # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
306
+ # 2) Clustermaps on *deduplicated* preprocessed AnnData
314
307
  # ============================================================
315
- pp_dir_dedup = pp_dir / "deduplicated"
316
- pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
317
- pp_umap_dir = pp_dir_dedup / "07_umaps"
308
+ spatial_dir_dedup = spatial_directory / "deduplicated"
309
+ clustermap_dir_dedup = spatial_dir_dedup / "06_clustermaps"
318
310
 
319
311
  # Clustermaps on deduplicated adata
320
- if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
312
+ if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
321
313
  logger.debug(
322
- f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
314
+ f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
323
315
  )
324
316
  else:
325
- make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
317
+ make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
326
318
  combined_raw_clustermap(
327
319
  adata,
328
320
  sample_col=cfg.sample_name_col_for_plotting,
@@ -342,53 +334,113 @@ def spatial_adata_core(
342
334
  0
343
335
  ],
344
336
  min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
345
- demux_types=("double", "already"),
337
+ demux_types=cfg.clustermap_demux_types_to_plot,
346
338
  bins=None,
347
339
  sample_mapping=None,
348
- save_path=pp_clustermap_dir_dedup,
340
+ save_path=clustermap_dir_dedup,
349
341
  sort_by=cfg.spatial_clustermap_sortby,
350
342
  deaminase=deaminase,
351
343
  index_col_suffix=reindex_suffix,
352
344
  )
353
345
 
354
- # UMAP / Leiden
355
- if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
356
- logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
346
+ # ============================================================
347
+ # 2b) Rolling NN distances + layer clustermaps
348
+ # ============================================================
349
+ pp_rolling_nn_dir = spatial_dir_dedup / "06b_rolling_nn_clustermaps"
350
+
351
+ if pp_rolling_nn_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
352
+ logger.debug(f"{pp_rolling_nn_dir} already exists. Skipping rolling NN distance plots.")
357
353
  else:
358
- make_dirs([pp_umap_dir])
359
-
360
- var_filters = []
361
- if smf_modality == "direct":
362
- for ref in references:
363
- for base in cfg.mod_target_bases:
364
- var_filters.append(f"{ref}_{base}_site")
365
- elif deaminase:
366
- for ref in references:
367
- var_filters.append(f"{ref}_C_site")
368
- else:
369
- for ref in references:
370
- for base in cfg.mod_target_bases:
371
- var_filters.append(f"{ref}_{base}_site")
372
-
373
- adata = calculate_umap(
374
- adata,
375
- layer=cfg.layer_for_umap_plotting,
376
- var_filters=var_filters,
377
- n_pcs=10,
378
- knn_neighbors=15,
354
+ make_dirs([pp_rolling_nn_dir])
355
+ samples = (
356
+ adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
379
357
  )
358
+ references = adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
380
359
 
381
- sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
360
+ for reference in references:
361
+ for sample in samples:
362
+ mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (
363
+ adata.obs[cfg.reference_column] == reference
364
+ )
365
+ if not mask.any():
366
+ continue
367
+
368
+ subset = adata[mask]
369
+ site_mask = (
370
+ adata.var[[f"{reference}_{st}_site" for st in cfg.rolling_nn_site_types]]
371
+ .fillna(False)
372
+ .any(axis=1)
373
+ )
374
+ subset = subset[:, site_mask].copy()
375
+ try:
376
+ rolling_values, rolling_starts = rolling_window_nn_distance(
377
+ subset,
378
+ layer=cfg.rolling_nn_layer,
379
+ window=cfg.rolling_nn_window,
380
+ step=cfg.rolling_nn_step,
381
+ min_overlap=cfg.rolling_nn_min_overlap,
382
+ return_fraction=cfg.rolling_nn_return_fraction,
383
+ store_obsm=cfg.rolling_nn_obsm_key,
384
+ )
385
+ except Exception as exc:
386
+ logger.warning(
387
+ "Rolling NN distance computation failed for sample=%s ref=%s: %s",
388
+ sample,
389
+ reference,
390
+ exc,
391
+ )
392
+ continue
382
393
 
383
- sc.settings.figdir = pp_umap_dir
384
- umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
385
- umap_layers += cfg.umap_layers_to_plot
386
- sc.pl.umap(adata, color=umap_layers, show=False, save=True)
394
+ safe_sample = str(sample).replace(os.sep, "_")
395
+ safe_ref = str(reference).replace(os.sep, "_")
396
+ parent_obsm_key = f"{cfg.rolling_nn_obsm_key}__{safe_ref}"
397
+ try:
398
+ assign_rolling_nn_results(
399
+ adata,
400
+ subset,
401
+ rolling_values,
402
+ rolling_starts,
403
+ obsm_key=parent_obsm_key,
404
+ window=cfg.rolling_nn_window,
405
+ step=cfg.rolling_nn_step,
406
+ min_overlap=cfg.rolling_nn_min_overlap,
407
+ return_fraction=cfg.rolling_nn_return_fraction,
408
+ layer=cfg.rolling_nn_layer,
409
+ )
410
+ except Exception as exc:
411
+ logger.warning(
412
+ "Failed to merge rolling NN results for sample=%s ref=%s: %s",
413
+ sample,
414
+ reference,
415
+ exc,
416
+ )
417
+ adata.uns.setdefault(f"{cfg.rolling_nn_obsm_key}_reference_map", {})[reference] = (
418
+ parent_obsm_key
419
+ )
420
+ out_png = pp_rolling_nn_dir / f"{safe_sample}__{safe_ref}.png"
421
+ title = f"{sample} {reference}"
422
+ try:
423
+ plot_rolling_nn_and_layer(
424
+ subset,
425
+ obsm_key=cfg.rolling_nn_obsm_key,
426
+ layer_key=cfg.rolling_nn_plot_layer,
427
+ max_nan_fraction=cfg.position_max_nan_threshold,
428
+ var_valid_fraction_col=f"{reference}_valid_fraction",
429
+ title=title,
430
+ save_name=out_png,
431
+ )
432
+ except Exception as exc:
433
+ logger.warning(
434
+ "Failed rolling NN plot for sample=%s ref=%s: %s",
435
+ sample,
436
+ reference,
437
+ exc,
438
+ )
387
439
 
388
440
  # ============================================================
389
441
  # 3) Spatial autocorrelation + rolling metrics
390
442
  # ============================================================
391
- pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
443
+ pp_autocorr_dir = spatial_dir_dedup / "08_autocorrelations"
392
444
 
393
445
  if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
394
446
  logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
@@ -731,10 +783,10 @@ def spatial_adata_core(
731
783
  # ============================================================
732
784
  # 4) Pearson / correlation matrices
733
785
  # ============================================================
734
- pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
786
+ corr_dir = spatial_dir_dedup / "09_correlation_matrices"
735
787
 
736
- if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
737
- logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
788
+ if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
789
+ logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
738
790
  else:
739
791
  compute_positionwise_statistics(
740
792
  adata,
@@ -759,7 +811,7 @@ def spatial_adata_core(
759
811
  cmaps=cfg.correlation_matrix_cmaps,
760
812
  vmin=None,
761
813
  vmax=None,
762
- output_dir=pp_corr_dir,
814
+ output_dir=corr_dir,
763
815
  output_key="positionwise_result",
764
816
  )
765
817
 
smftools/cli_entry.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from pathlib import Path
3
5
  from typing import Sequence
@@ -6,14 +8,37 @@ import click
6
8
  import pandas as pd
7
9
 
8
10
  from .cli.hmm_adata import hmm_adata
11
+ from .cli.latent_adata import latent_adata
9
12
  from .cli.load_adata import load_adata
10
13
  from .cli.preprocess_adata import preprocess_adata
11
14
  from .cli.spatial_adata import spatial_adata
12
15
  from .informatics.pod5_functions import subsample_pod5
13
- from .logging_utils import setup_logging
16
+ from .logging_utils import get_logger, setup_logging
14
17
  from .readwrite import concatenate_h5ads
15
18
 
16
19
 
20
+ def _configure_multiprocessing() -> None:
21
+ import multiprocessing as mp
22
+ import sys
23
+
24
+ logger = get_logger(__name__)
25
+
26
+ try:
27
+ if sys.platform == "win32":
28
+ mp.set_start_method("spawn")
29
+ logger.debug("Setting multiprocessing start method to spawn")
30
+ else:
31
+ # try forkserver first, fall back to spawn
32
+ try:
33
+ mp.set_start_method("forkserver")
34
+ logger.debug("Setting multiprocessing start method to forkserver")
35
+ except ValueError:
36
+ mp.set_start_method("spawn")
37
+ logger.debug("Setting multiprocessing start method to spawn")
38
+ except RuntimeError:
39
+ logger.warning("Could not set multiprocessing start method")
40
+
41
+
17
42
  @click.group()
18
43
  @click.option(
19
44
  "--log-file",
@@ -32,6 +57,7 @@ def cli(log_file: Path | None, log_level: str):
32
57
  """Command-line interface for smftools."""
33
58
  level = getattr(logging, log_level.upper(), logging.INFO)
34
59
  setup_logging(level=level, log_file=log_file)
60
+ _configure_multiprocessing()
35
61
 
36
62
 
37
63
  ####### Load anndata from raw data ###########
@@ -78,6 +104,17 @@ def hmm(config_path):
78
104
  ##########################################
79
105
 
80
106
 
107
+ ####### Latent ###########
108
+ @cli.command()
109
+ @click.argument("config_path", type=click.Path(exists=True))
110
+ def latent(config_path):
111
+ """Process data from CONFIG_PATH."""
112
+ latent_adata(config_path)
113
+
114
+
115
+ ##########################################
116
+
117
+
81
118
  ####### batch command ###########
82
119
  @cli.command()
83
120
  @click.argument(
@@ -1 +1,3 @@
1
+ from __future__ import annotations
2
+
1
3
  from .experiment_config import ExperimentConfig, LoadExperimentConfig
@@ -15,6 +15,16 @@ autocorr_site_types:
15
15
 
16
16
  # Spatial Analysis - Clustermap params
17
17
  layer_for_clustermap_plotting: 'nan0_0minus1'
18
+ rolling_nn_layer: "nan0_0minus1"
19
+ rolling_nn_plot_layer: "nan0_0minus1"
20
+ rolling_nn_window: 30
21
+ rolling_nn_step: 2
22
+ rolling_nn_min_overlap: 20
23
+ rolling_nn_return_fraction: true
24
+ rolling_nn_obsm_key: "rolling_nn_dist"
25
+ rolling_nn_site_types:
26
+ - "GpC"
27
+ - "CpG"
18
28
  clustermap_cmap_c: "coolwarm"
19
29
  clustermap_cmap_gpc: "coolwarm"
20
30
  clustermap_cmap_cpg: "viridis"
@@ -46,4 +56,4 @@ hmm_feature_sets:
46
56
  cpg_patch: [0, inf]
47
57
 
48
58
  hmm_merge_layer_features:
49
- - ["all_accessible_features", 60]
59
+ - ["all_accessible_features", 60]
@@ -18,8 +18,9 @@ conversions:
18
18
  fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
19
19
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
20
20
  input_already_demuxed: False # If the input files are already demultiplexed.
21
+
21
22
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
22
- delete_intermediate_bams: True # Whether to delete intermediate BAM files.
23
+ delete_intermediate_bams: False # Whether to delete intermediate BAM files.
23
24
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
24
25
 
25
26
  # Sequencing modality and general experiment params
@@ -77,6 +78,10 @@ aligner_args:
77
78
  # Sorted BAM and BED specific handling
78
79
  make_bigwigs: False # Whether to make coverage bigwigs
79
80
  make_beds: False # Whether to make beds from the aligned bams
81
+ annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
82
+ samtools_backend: auto # auto|python|cli for samtools-compatible operations
83
+ bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
84
+ bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
80
85
 
81
86
  # Nanopore specific demultiplexing
82
87
  barcode_both_ends: False # dorado demultiplexing
@@ -87,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
87
92
  reference_column: 'Reference_strand'
88
93
  sample_column: 'Experiment_name_and_barcode'
89
94
 
95
+ # Plotting params
96
+ clustermap_demux_types_to_plot:
97
+ - "single"
98
+ - "double"
99
+ - "already"
100
+
90
101
  ######## smftools preprocess params #########
91
102
  # Read length, quality, and mapping filtering params
92
103
  read_coord_filter:
@@ -137,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
137
148
  - "CpG"
138
149
  - "ambiguous_GpC_CpG"
139
150
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
151
+ duplicate_detection_demux_types_to_use:
152
+ - "single"
153
+ - "double"
154
+ - "already"
140
155
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
141
156
  - Fraction_C_site_modified
142
157
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
@@ -148,6 +163,11 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
148
163
 
149
164
  # Position QC params
150
165
  position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
166
+ mismatch_frequency_range:
167
+ - 0.01
168
+ - 0.99
169
+ mismatch_frequency_layer: "mismatch_integer_encoding"
170
+ mismatch_frequency_read_span_layer: "read_span_mask"
151
171
 
152
172
  ######## smftools spatial params #########
153
173
  invert_adata: False # Whether to invert the AnnData along the positions axis.
@@ -166,6 +186,9 @@ clustermap_cmap_gpc: "coolwarm"
166
186
  clustermap_cmap_cpg: "coolwarm"
167
187
  clustermap_cmap_a: "coolwarm"
168
188
  spatial_clustermap_sortby: "gpc"
189
+ rolling_nn_site_types:
190
+ - "GpC"
191
+ - "CpG"
169
192
 
170
193
  # Spatial Analysis - UMAP/Leiden params
171
194
  layer_for_umap_plotting: 'nan_half'
@@ -240,6 +263,18 @@ hmm_feature_sets:
240
263
  mid_accessible_patch: [20, 40]
241
264
  large_accessible_patch: [40, 110]
242
265
  nucleosome_depleted_region: [110, inf]
266
+ hmm_feature_colormaps:
267
+ small_accessible_patch: "#A5D6A7"
268
+ mid_accessible_patch: "#2E7D32"
269
+ large_accessible_patch: "#006400"
270
+ nucleosome_depleted_region: "#00441B"
271
+ all_accessible_features: "#2E7D32"
272
+ small_bound_stretch: "#1E88E5"
273
+ medium_bound_stretch: "#6A1B9A"
274
+ large_bound_stretch: "#FB8C00"
275
+ putative_nucleosome: "#6D4C41"
276
+ all_footprint_features: "#6A1B9A"
277
+ cpg_patch: "#6D4C41"
243
278
  hmm_merge_layer_features:
244
279
  - ["all_accessible_features", 60]
245
280
  clustermap_cmap_hmm: "coolwarm"
@@ -256,6 +291,11 @@ hmm_clustermap_feature_layers:
256
291
  - medium_bound_stretch
257
292
  - putative_nucleosome
258
293
  - large_bound_stretch
294
+ - all_footprint_features
295
+ hmm_clustermap_length_layers:
296
+ - all_accessible_features
297
+ - all_accessible_features_merged
298
+ - all_footprint_features
259
299
  hmm_clustermap_sortby: "hmm"
260
300
  hmm_peak_feature_configs:
261
301
  all_accessible_features:
@@ -370,4 +410,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
370
410
  bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
371
411
  force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
372
412
  bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
373
- force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
413
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference