smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,16 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Optional, Tuple
3
5
 
4
6
  import anndata as ad
5
7
 
8
+ from smftools.logging_utils import get_logger
9
+ from smftools.optional_imports import require
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
6
14
  def spatial_adata(
7
15
  config_path: str,
8
16
  ) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
@@ -26,10 +34,10 @@ def spatial_adata(
26
34
  spatial_adata_path : Path | None
27
35
  Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
28
36
  """
29
- from ..readwrite import safe_read_h5ad, make_dirs, add_or_update_column_in_csv
37
+ from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
38
+ from .helpers import get_adata_paths
30
39
  from .load_adata import load_adata
31
40
  from .preprocess_adata import preprocess_adata
32
- from .helpers import get_adata_paths
33
41
 
34
42
  # 1) Ensure config + basic paths via load_adata
35
43
  loaded_adata, loaded_path, cfg = load_adata(config_path)
@@ -45,21 +53,22 @@ def spatial_adata(
45
53
  if not getattr(cfg, "force_redo_spatial_analyses", False):
46
54
  # If HMM exists, it's the most processed stage — reuse it.
47
55
  if hmm_path.exists():
48
- print(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
56
+ logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
49
57
  return None, hmm_path
50
58
 
51
59
  # If spatial exists, we consider spatial analyses already done.
52
60
  if spatial_path.exists():
53
- print(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
61
+ logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
54
62
  return None, spatial_path
55
63
 
56
64
  # 2) Ensure preprocessing has been run
57
65
  # This will create pp/pp_dedup as needed or return them if they already exist.
58
- pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(config_path)
66
+ pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
67
+ config_path
68
+ )
59
69
 
60
70
  # Helper to load from disk, reusing loaded_adata if it matches
61
71
  def _load(path: Path):
62
- from ..readwrite import safe_read_h5ad
63
72
  if loaded_adata is not None and loaded_path == path:
64
73
  return loaded_adata
65
74
  adata, _ = safe_read_h5ad(path)
@@ -69,15 +78,19 @@ def spatial_adata(
69
78
  # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
70
79
  if pp_dedup_adata is not None:
71
80
  start_adata = pp_dedup_adata
81
+ source_path = pp_dedup_adata_path_ret
72
82
  else:
73
83
  if pp_dedup_path.exists():
74
84
  start_adata = _load(pp_dedup_path)
85
+ source_path = pp_dedup_path
75
86
  elif pp_path.exists():
76
87
  start_adata = _load(pp_path)
88
+ source_path = pp_path
77
89
  elif raw_path.exists():
78
90
  start_adata = _load(raw_path)
91
+ source_path = raw_path
79
92
  else:
80
- print("No suitable AnnData found for spatial analyses (need at least raw).")
93
+ logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
81
94
  return None, None
82
95
 
83
96
  # 4) Run the spatial core
@@ -88,6 +101,8 @@ def spatial_adata(
88
101
  pp_adata_path=pp_path,
89
102
  pp_dup_rem_adata_path=pp_dedup_path,
90
103
  pp_adata_in_memory=pp_adata,
104
+ source_adata_path=source_path,
105
+ config_path=config_path,
91
106
  )
92
107
 
93
108
  # 5) Register spatial path in summary CSV
@@ -103,6 +118,8 @@ def spatial_adata_core(
103
118
  pp_adata_path: Path,
104
119
  pp_dup_rem_adata_path: Path,
105
120
  pp_adata_in_memory: Optional[ad.AnnData] = None,
121
+ source_adata_path: Optional[Path] = None,
122
+ config_path: Optional[str] = None,
106
123
  ) -> Tuple[ad.AnnData, Path]:
107
124
  """
108
125
  Core spatial analysis pipeline.
@@ -139,32 +156,33 @@ def spatial_adata_core(
139
156
 
140
157
  import numpy as np
141
158
  import pandas as pd
142
- import scanpy as sc
143
159
 
144
- from ..readwrite import make_dirs, safe_read_h5ad
145
- from .helpers import write_gz_h5ad
160
+ sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
146
161
 
147
- from ..preprocessing import (
148
- load_sample_sheet,
149
- invert_adata,
150
- reindex_references_adata,
151
- )
162
+ from ..metadata import record_smftools_metadata
152
163
  from ..plotting import (
153
164
  combined_raw_clustermap,
154
165
  plot_rolling_grid,
155
166
  plot_spatial_autocorr_grid,
156
167
  )
168
+ from ..preprocessing import (
169
+ invert_adata,
170
+ load_sample_sheet,
171
+ reindex_references_adata,
172
+ )
173
+ from ..readwrite import make_dirs, safe_read_h5ad
157
174
  from ..tools import calculate_umap
175
+ from ..tools.position_stats import (
176
+ compute_positionwise_statistics,
177
+ plot_positionwise_matrices,
178
+ )
158
179
  from ..tools.spatial_autocorrelation import (
159
- binary_autocorrelation_with_spacing,
160
180
  analyze_autocorr_matrix,
181
+ binary_autocorrelation_with_spacing,
161
182
  bootstrap_periodicity,
162
183
  rolling_autocorr_metrics,
163
184
  )
164
- from ..tools.position_stats import (
165
- compute_positionwise_statistics,
166
- plot_positionwise_matrices,
167
- )
185
+ from .helpers import write_gz_h5ad
168
186
 
169
187
  # -----------------------------
170
188
  # General setup
@@ -207,7 +225,12 @@ def spatial_adata_core(
207
225
  offsets=cfg.reindexing_offsets,
208
226
  new_col=cfg.reindexed_var_suffix,
209
227
  )
210
-
228
+
229
+ if adata.uns.get("reindex_references_adata_performed", False):
230
+ reindex_suffix = cfg.reindexed_var_suffix
231
+ else:
232
+ reindex_suffix = None
233
+
211
234
  pp_dir = output_directory / "preprocessed"
212
235
  references = adata.obs[cfg.reference_column].cat.categories
213
236
 
@@ -223,7 +246,9 @@ def spatial_adata_core(
223
246
  if pp_clustermap_dir.is_dir() and not getattr(
224
247
  cfg, "force_redo_spatial_analyses", False
225
248
  ):
226
- print(f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData.")
249
+ logger.debug(
250
+ f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
251
+ )
227
252
  else:
228
253
  make_dirs([pp_dir, pp_clustermap_dir])
229
254
 
@@ -232,6 +257,34 @@ def spatial_adata_core(
232
257
  else:
233
258
  pp_adata, _ = safe_read_h5ad(pp_adata_path)
234
259
 
260
+ # -----------------------------
261
+ # Optional sample sheet metadata
262
+ # -----------------------------
263
+ if getattr(cfg, "sample_sheet_path", None):
264
+ load_sample_sheet(
265
+ pp_adata,
266
+ cfg.sample_sheet_path,
267
+ mapping_key_column=cfg.sample_sheet_mapping_column,
268
+ as_category=True,
269
+ force_reload=cfg.force_reload_sample_sheet,
270
+ )
271
+
272
+ # -----------------------------
273
+ # Optional inversion along positions axis
274
+ # -----------------------------
275
+ if getattr(cfg, "invert_adata", False):
276
+ pp_adata = invert_adata(pp_adata)
277
+
278
+ # -----------------------------
279
+ # Optional reindexing by reference
280
+ # -----------------------------
281
+ reindex_references_adata(
282
+ pp_adata,
283
+ reference_col=cfg.reference_column,
284
+ offsets=cfg.reindexing_offsets,
285
+ new_col=cfg.reindexed_var_suffix,
286
+ )
287
+
235
288
  combined_raw_clustermap(
236
289
  pp_adata,
237
290
  sample_col=cfg.sample_name_col_for_plotting,
@@ -247,16 +300,19 @@ def spatial_adata_core(
247
300
  cmap_a=cfg.clustermap_cmap_a,
248
301
  min_quality=cfg.read_quality_filter_thresholds[0],
249
302
  min_length=cfg.read_len_filter_thresholds[0],
250
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
303
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
304
+ 0
305
+ ],
251
306
  min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
307
+ demux_types=("double", "already"),
252
308
  bins=None,
253
309
  sample_mapping=None,
254
310
  save_path=pp_clustermap_dir,
255
311
  sort_by=cfg.spatial_clustermap_sortby,
256
312
  deaminase=deaminase,
257
- index_col_suffix=cfg.reindexed_var_suffix,
313
+ index_col_suffix=reindex_suffix,
258
314
  )
259
-
315
+
260
316
  # ============================================================
261
317
  # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
262
318
  # ============================================================
@@ -265,10 +321,10 @@ def spatial_adata_core(
265
321
  pp_umap_dir = pp_dir_dedup / "07_umaps"
266
322
 
267
323
  # Clustermaps on deduplicated adata
268
- if pp_clustermap_dir_dedup.is_dir() and not getattr(
269
- cfg, "force_redo_spatial_analyses", False
270
- ):
271
- print(f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData.")
324
+ if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
325
+ logger.debug(
326
+ f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
327
+ )
272
328
  else:
273
329
  make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
274
330
  combined_raw_clustermap(
@@ -286,19 +342,22 @@ def spatial_adata_core(
286
342
  cmap_a=cfg.clustermap_cmap_a,
287
343
  min_quality=cfg.read_quality_filter_thresholds[0],
288
344
  min_length=cfg.read_len_filter_thresholds[0],
289
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
345
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
346
+ 0
347
+ ],
290
348
  min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
349
+ demux_types=("double", "already"),
291
350
  bins=None,
292
351
  sample_mapping=None,
293
352
  save_path=pp_clustermap_dir_dedup,
294
353
  sort_by=cfg.spatial_clustermap_sortby,
295
354
  deaminase=deaminase,
296
- index_col_suffix=cfg.reindexed_var_suffix,
355
+ index_col_suffix=reindex_suffix,
297
356
  )
298
357
 
299
358
  # UMAP / Leiden
300
359
  if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
301
- print(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
360
+ logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
302
361
  else:
303
362
  make_dirs([pp_umap_dir])
304
363
 
@@ -336,40 +395,48 @@ def spatial_adata_core(
336
395
  pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
337
396
 
338
397
  if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
339
- print(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
398
+ logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
340
399
  else:
341
400
  positions = adata.var_names.astype(int).values
342
401
  lags = np.arange(cfg.autocorr_max_lag + 1)
343
402
 
344
403
  try:
345
404
  from joblib import Parallel, delayed
405
+
346
406
  _have_joblib = True
347
407
  except Exception:
348
408
  _have_joblib = False
349
409
 
350
- samples = adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
410
+ samples = (
411
+ adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
412
+ )
351
413
  ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
352
414
  refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
353
415
 
354
416
  for site_type in cfg.autocorr_site_types:
355
417
  layer_key = f"{site_type}_site_binary"
356
418
  if layer_key not in adata.layers:
357
- print(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
419
+ logger.debug(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
358
420
  continue
359
421
 
360
422
  X = adata.layers[layer_key]
361
423
  if getattr(X, "shape", (0,))[0] == 0:
362
- print(f"Layer {layer_key} empty — skipping {site_type}.")
424
+ logger.debug(f"Layer {layer_key} empty — skipping {site_type}.")
363
425
  continue
364
426
 
365
427
  rows = []
366
428
  counts = []
367
429
 
368
430
  if _have_joblib:
431
+
369
432
  def _worker(row):
370
433
  try:
371
434
  ac, cnts = binary_autocorrelation_with_spacing(
372
- row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
435
+ row,
436
+ positions,
437
+ max_lag=cfg.autocorr_max_lag,
438
+ return_counts=True,
439
+ normalize=cfg.autocorr_normalization_method,
373
440
  )
374
441
  except Exception:
375
442
  ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
@@ -385,7 +452,11 @@ def spatial_adata_core(
385
452
  else:
386
453
  for i in range(X.shape[0]):
387
454
  ac, cnts = binary_autocorrelation_with_spacing(
388
- X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
455
+ X[i],
456
+ positions,
457
+ max_lag=cfg.autocorr_max_lag,
458
+ return_counts=True,
459
+ normalize=cfg.autocorr_normalization_method,
389
460
  )
390
461
  rows.append(ac)
391
462
  counts.append(cnts)
@@ -474,7 +545,9 @@ def spatial_adata_core(
474
545
  try:
475
546
  r = analyze_autocorr_matrix(
476
547
  ac_sel,
477
- cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
548
+ cnt_sel
549
+ if cnt_sel is not None
550
+ else np.zeros_like(ac_sel, dtype=int),
478
551
  lags,
479
552
  nrl_search_bp=(120, 260),
480
553
  pad_factor=4,
@@ -489,7 +562,9 @@ def spatial_adata_core(
489
562
 
490
563
  adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
491
564
 
492
- global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
565
+ global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
566
+ "nrl_bp", None
567
+ )
493
568
 
494
569
  rolling_cfg = {
495
570
  "window_size": getattr(
@@ -554,27 +629,31 @@ def spatial_adata_core(
554
629
  fixed_nrl_bp=global_nrl,
555
630
  )
556
631
  except Exception as e:
557
- warnings.warn(
632
+ logger.warning(
558
633
  f"rolling_autocorr_metrics failed for {site_type} "
559
634
  f"{sample_name} {ref_label}: {e}"
560
635
  )
561
636
  continue
562
637
 
563
638
  if "center" not in df_roll.columns:
564
- warnings.warn(
639
+ logger.warning(
565
640
  f"rolling_autocorr_metrics returned unexpected schema "
566
641
  f"for {site_type} {sample_name} {ref_label}"
567
642
  )
568
643
  continue
569
644
 
570
- compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
645
+ compact_df = df_roll[
646
+ ["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]
647
+ ].copy()
571
648
  compact_df["site"] = site_type
572
649
  compact_df["sample"] = sample_name
573
650
  compact_df["reference"] = ref_label if ref_label != "all" else "all"
574
651
 
575
652
  if write_csvs:
576
653
  safe_sample = str(sample_name).replace(os.sep, "_")
577
- safe_ref = str(ref_label if ref_label != "all" else "all").replace(os.sep, "_")
654
+ safe_ref = str(ref_label if ref_label != "all" else "all").replace(
655
+ os.sep, "_"
656
+ )
578
657
  out_csv = os.path.join(
579
658
  site_out_dir,
580
659
  f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
@@ -582,7 +661,7 @@ def spatial_adata_core(
582
661
  try:
583
662
  compact_df.to_csv(out_csv, index=False)
584
663
  except Exception as e:
585
- warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
664
+ logger.warning(f"Failed to write rolling CSV {out_csv}: {e}")
586
665
 
587
666
  if write_plots:
588
667
  try:
@@ -604,7 +683,7 @@ def spatial_adata_core(
604
683
  show=False,
605
684
  )
606
685
  except Exception as e:
607
- warnings.warn(
686
+ logger.warning(
608
687
  f"Failed to create rolling plot for {site_type} "
609
688
  f"{sample_name} {ref_label}: {e}"
610
689
  )
@@ -612,7 +691,9 @@ def spatial_adata_core(
612
691
  combined_rows.append(
613
692
  compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
614
693
  )
615
- rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
694
+ rolling_results_by_group[
695
+ (sample_name, None if ref_label == "all" else ref_label)
696
+ ] = compact_df
616
697
 
617
698
  adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
618
699
 
@@ -624,9 +705,7 @@ def spatial_adata_core(
624
705
  try:
625
706
  combined_df_site.to_csv(combined_out_csv, index=False)
626
707
  except Exception as e:
627
- warnings.warn(
628
- f"Failed to write combined rolling CSV for {site_type}: {e}"
629
- )
708
+ logger.warning(f"Failed to write combined rolling CSV for {site_type}: {e}")
630
709
 
631
710
  rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
632
711
  plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
@@ -650,6 +729,7 @@ def spatial_adata_core(
650
729
  sample_col=cfg.sample_name_col_for_plotting,
651
730
  window=cfg.autocorr_rolling_window_size,
652
731
  rows_per_fig=cfg.rows_per_qc_autocorr_grid,
732
+ normalization_method=cfg.autocorr_normalization_method,
653
733
  )
654
734
 
655
735
  # ============================================================
@@ -658,7 +738,7 @@ def spatial_adata_core(
658
738
  pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
659
739
 
660
740
  if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
661
- print(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
741
+ logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
662
742
  else:
663
743
  compute_positionwise_statistics(
664
744
  adata,
@@ -691,7 +771,15 @@ def spatial_adata_core(
691
771
  # 5) Save spatial AnnData
692
772
  # ============================================================
693
773
  if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
694
- print("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
774
+ logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
775
+ record_smftools_metadata(
776
+ adata,
777
+ step_name="spatial",
778
+ cfg=cfg,
779
+ config_path=config_path,
780
+ input_paths=[source_adata_path] if source_adata_path else None,
781
+ output_path=spatial_adata_path,
782
+ )
695
783
  write_gz_h5ad(adata, spatial_adata_path)
696
784
 
697
- return adata, spatial_adata_path
785
+ return adata, spatial_adata_path