smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +48 -0
  3. smftools/cli/hmm_adata.py +168 -145
  4. smftools/cli/load_adata.py +155 -95
  5. smftools/cli/preprocess_adata.py +222 -130
  6. smftools/cli/spatial_adata.py +441 -308
  7. smftools/cli_entry.py +4 -5
  8. smftools/config/conversion.yaml +12 -5
  9. smftools/config/deaminase.yaml +11 -9
  10. smftools/config/default.yaml +123 -19
  11. smftools/config/direct.yaml +3 -0
  12. smftools/config/experiment_config.py +120 -19
  13. smftools/hmm/HMM.py +12 -1
  14. smftools/hmm/__init__.py +0 -6
  15. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  16. smftools/hmm/call_hmm_peaks.py +318 -90
  17. smftools/informatics/bam_functions.py +28 -29
  18. smftools/informatics/h5ad_functions.py +1 -1
  19. smftools/plotting/general_plotting.py +97 -51
  20. smftools/plotting/position_stats.py +3 -3
  21. smftools/preprocessing/__init__.py +2 -4
  22. smftools/preprocessing/append_base_context.py +34 -25
  23. smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
  24. smftools/preprocessing/binarize_on_Youden.py +10 -8
  25. smftools/preprocessing/calculate_complexity_II.py +1 -1
  26. smftools/preprocessing/calculate_coverage.py +16 -13
  27. smftools/preprocessing/calculate_position_Youden.py +41 -25
  28. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  29. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  30. smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
  31. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  32. smftools/preprocessing/invert_adata.py +1 -1
  33. smftools/preprocessing/load_sample_sheet.py +1 -1
  34. smftools/preprocessing/reindex_references_adata.py +37 -0
  35. smftools/readwrite.py +94 -0
  36. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
  37. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
  38. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  39. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  40. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  41. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  42. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  43. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  44. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  45. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
  46. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,239 +1,356 @@
1
- def spatial_adata(config_path):
2
- """
3
- High-level function to call for spatial analysis of an adata object.
4
- Command line accesses this through smftools spatial <config_path>
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
5
3
 
6
- Parameters:
7
- config_path (str): A string representing the file path to the experiment configuration csv file.
4
+ import anndata as ad
8
5
 
9
- Returns:
10
- (pp_dedup_spatial_adata, pp_dedup_spatial_adata_path)
6
+ def spatial_adata(
7
+ config_path: str,
8
+ ) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
9
+ """
10
+ CLI-facing wrapper for spatial analyses.
11
+
12
+ Called by: `smftools spatial <config_path>`
13
+
14
+ Responsibilities:
15
+ - Ensure a usable AnnData exists via `load_adata` + `preprocess_adata`.
16
+ - Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
17
+ - Respect cfg.force_redo_spatial_analyses.
18
+ - Decide whether to skip (return existing) or run the spatial core.
19
+ - Call `spatial_adata_core(...)` when actual work is needed.
20
+
21
+ Returns
22
+ -------
23
+ spatial_adata : AnnData | None
24
+ AnnData with spatial analyses, or None if we skipped because a later-stage
25
+ AnnData already exists.
26
+ spatial_adata_path : Path | None
27
+ Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
11
28
  """
12
- from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
29
+ from ..readwrite import safe_read_h5ad, make_dirs, add_or_update_column_in_csv
13
30
  from .load_adata import load_adata
14
31
  from .preprocess_adata import preprocess_adata
32
+ from .helpers import get_adata_paths
33
+
34
+ # 1) Ensure config + basic paths via load_adata
35
+ loaded_adata, loaded_path, cfg = load_adata(config_path)
36
+ paths = get_adata_paths(cfg)
37
+
38
+ raw_path = paths.raw
39
+ pp_path = paths.pp
40
+ pp_dedup_path = paths.pp_dedup
41
+ spatial_path = paths.spatial
42
+ hmm_path = paths.hmm
43
+
44
+ # Stage-skipping logic for spatial
45
+ if not getattr(cfg, "force_redo_spatial_analyses", False):
46
+ # If HMM exists, it's the most processed stage — reuse it.
47
+ if hmm_path.exists():
48
+ print(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
49
+ return None, hmm_path
50
+
51
+ # If spatial exists, we consider spatial analyses already done.
52
+ if spatial_path.exists():
53
+ print(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
54
+ return None, spatial_path
55
+
56
+ # 2) Ensure preprocessing has been run
57
+ # This will create pp/pp_dedup as needed or return them if they already exist.
58
+ pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(config_path)
59
+
60
+ # Helper to load from disk, reusing loaded_adata if it matches
61
+ def _load(path: Path):
62
+ from ..readwrite import safe_read_h5ad
63
+ if loaded_adata is not None and loaded_path == path:
64
+ return loaded_adata
65
+ adata, _ = safe_read_h5ad(path)
66
+ return adata
67
+
68
+ # 3) Decide which AnnData to use as the *starting point* for spatial analyses
69
+ # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
70
+ if pp_dedup_adata is not None:
71
+ start_adata = pp_dedup_adata
72
+ else:
73
+ if pp_dedup_path.exists():
74
+ start_adata = _load(pp_dedup_path)
75
+ elif pp_path.exists():
76
+ start_adata = _load(pp_path)
77
+ elif raw_path.exists():
78
+ start_adata = _load(raw_path)
79
+ else:
80
+ print("No suitable AnnData found for spatial analyses (need at least raw).")
81
+ return None, None
82
+
83
+ # 4) Run the spatial core
84
+ adata_spatial, spatial_path = spatial_adata_core(
85
+ adata=start_adata,
86
+ cfg=cfg,
87
+ spatial_adata_path=spatial_path,
88
+ pp_adata_path=pp_path,
89
+ pp_dup_rem_adata_path=pp_dedup_path,
90
+ pp_adata_in_memory=pp_adata,
91
+ )
92
+
93
+ # 5) Register spatial path in summary CSV
94
+ add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
95
+
96
+ return adata_spatial, spatial_path
97
+
98
+
99
+ def spatial_adata_core(
100
+ adata: ad.AnnData,
101
+ cfg,
102
+ spatial_adata_path: Path,
103
+ pp_adata_path: Path,
104
+ pp_dup_rem_adata_path: Path,
105
+ pp_adata_in_memory: Optional[ad.AnnData] = None,
106
+ ) -> Tuple[ad.AnnData, Path]:
107
+ """
108
+ Core spatial analysis pipeline.
109
+
110
+ Assumes:
111
+ - `adata` is (typically) the preprocessed, duplicate-removed AnnData.
112
+ - `cfg` is the ExperimentConfig.
113
+ - `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
114
+ from `get_adata_paths`.
115
+ - `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
116
+ the same run of `preprocess_adata`, to avoid re-reading from disk.
117
+
118
+ Does:
119
+ - Optional sample sheet load.
120
+ - Optional inversion & reindexing.
121
+ - Clustermaps on:
122
+ * preprocessed (non-dedup) AnnData (for non-direct modalities), and
123
+ * deduplicated preprocessed AnnData.
124
+ - PCA/UMAP/Leiden.
125
+ - Autocorrelation + rolling metrics + grids.
126
+ - Positionwise correlation matrices (non-direct modalities).
127
+ - Save spatial AnnData to `spatial_adata_path`.
128
+
129
+ Returns
130
+ -------
131
+ adata : AnnData
132
+ Spatially analyzed AnnData (same object, modified in-place).
133
+ spatial_adata_path : Path
134
+ Path where spatial AnnData was written.
135
+ """
136
+ import os
137
+ import warnings
138
+ from pathlib import Path
15
139
 
16
140
  import numpy as np
17
141
  import pandas as pd
18
- import anndata as ad
19
142
  import scanpy as sc
20
143
 
21
- import os
22
- from importlib import resources
23
- from pathlib import Path
24
-
25
- from datetime import datetime
26
- date_str = datetime.today().strftime("%y%m%d")
27
-
28
- ############################################### smftools load start ###############################################
29
- adata, adata_path, cfg = load_adata(config_path)
30
- # General config variable init - Necessary user passed inputs
31
- smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
32
- output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
33
- # Make initial output directory
144
+ from ..readwrite import make_dirs, safe_read_h5ad
145
+ from .helpers import write_gz_h5ad
146
+
147
+ from ..preprocessing import (
148
+ load_sample_sheet,
149
+ invert_adata,
150
+ reindex_references_adata,
151
+ )
152
+ from ..plotting import (
153
+ combined_raw_clustermap,
154
+ plot_rolling_grid,
155
+ plot_spatial_autocorr_grid,
156
+ )
157
+ from ..tools import calculate_umap
158
+ from ..tools.spatial_autocorrelation import (
159
+ binary_autocorrelation_with_spacing,
160
+ analyze_autocorr_matrix,
161
+ bootstrap_periodicity,
162
+ rolling_autocorr_metrics,
163
+ )
164
+ from ..tools.position_stats import (
165
+ compute_positionwise_statistics,
166
+ plot_positionwise_matrices,
167
+ )
168
+
169
+ # -----------------------------
170
+ # General setup
171
+ # -----------------------------
172
+ output_directory = Path(cfg.output_directory)
34
173
  make_dirs([output_directory])
35
- ############################################### smftools load end ###############################################
36
-
37
- ############################################### smftools preprocess start ###############################################
38
- pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
39
- ############################################### smftools preprocess end ###############################################
40
174
 
41
- ############################################### smftools spatial start ###############################################
42
- input_manager_df = pd.read_csv(cfg.summary_file)
43
- initial_adata_path = Path(input_manager_df['load_adata'][0])
44
- pp_adata_path = Path(input_manager_df['pp_adata'][0])
45
- pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
46
- spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
47
- hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
48
-
49
- if smf_modality == 'conversion':
175
+ smf_modality = cfg.smf_modality
176
+ if smf_modality == "conversion":
50
177
  deaminase = False
51
178
  else:
52
179
  deaminase = True
53
180
 
54
- if pp_adata and pp_dedup_adata:
55
- # This happens on first run of the preprocessing pipeline
56
- first_pp_run = True
57
- adata = pp_adata
58
- adata_unique = pp_dedup_adata
59
- else:
60
- # If an anndata is saved, check which stages of the anndata are available
61
- first_pp_run = False
62
- initial_version_available = initial_adata_path.exists()
63
- preprocessed_version_available = pp_adata_path.exists()
64
- preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
65
- preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
66
- hmm_version_available = hmm_adata_path.exists()
67
-
68
- if cfg.force_redo_basic_analyses:
69
- print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
70
- if preprocessed_dup_removed_version_available:
71
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
72
- adata_version = "pp_dedup"
73
- elif preprocessed_version_available:
74
- adata, load_report = safe_read_h5ad(pp_adata_path)
75
- adata_version = "pp"
76
- elif initial_version_available:
77
- adata, load_report = safe_read_h5ad(initial_adata_path)
78
- adata_version = "initial"
79
- else:
80
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
81
- return
82
- elif preprocessed_dedup_spatial_version_available:
83
- print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
84
- return None, spatial_adata_path
85
- elif preprocessed_dup_removed_version_available:
86
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
87
- adata_version = "pp_dedup"
88
- elif preprocessed_version_available:
89
- adata, load_report = safe_read_h5ad(pp_adata_path)
90
- adata_version = "pp"
91
- elif initial_version_available:
92
- adata, load_report = safe_read_h5ad(initial_adata_path)
93
- adata_version = "initial"
94
- else:
95
- print(f"No adata available.")
96
- return
181
+ first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
182
+
183
+ # -----------------------------
184
+ # Optional sample sheet metadata
185
+ # -----------------------------
186
+ if getattr(cfg, "sample_sheet_path", None):
187
+ load_sample_sheet(
188
+ adata,
189
+ cfg.sample_sheet_path,
190
+ mapping_key_column=cfg.sample_sheet_mapping_column,
191
+ as_category=True,
192
+ force_reload=cfg.force_reload_sample_sheet,
193
+ )
194
+
195
+ # -----------------------------
196
+ # Optional inversion along positions axis
197
+ # -----------------------------
198
+ if getattr(cfg, "invert_adata", False):
199
+ adata = invert_adata(adata)
200
+
201
+ # -----------------------------
202
+ # Optional reindexing by reference
203
+ # -----------------------------
204
+ reindex_references_adata(
205
+ adata,
206
+ reference_col=cfg.reference_column,
207
+ offsets=cfg.reindexing_offsets,
208
+ new_col=cfg.reindexed_var_suffix,
209
+ )
97
210
 
98
211
  pp_dir = output_directory / "preprocessed"
99
212
  references = adata.obs[cfg.reference_column].cat.categories
100
213
 
101
- if smf_modality != 'direct':
102
- ######### Clustermaps #########
214
+ # ============================================================
215
+ # 1) Clustermaps (non-direct modalities) on *preprocessed* data
216
+ # ============================================================
217
+ if smf_modality != "direct":
218
+ preprocessed_version_available = pp_adata_path.exists()
219
+
103
220
  if preprocessed_version_available:
104
221
  pp_clustermap_dir = pp_dir / "06_clustermaps"
105
222
 
106
- if pp_clustermap_dir.is_dir():
107
- print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
223
+ if pp_clustermap_dir.is_dir() and not getattr(
224
+ cfg, "force_redo_spatial_analyses", False
225
+ ):
226
+ print(f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData.")
108
227
  else:
109
- from ..plotting import combined_raw_clustermap
110
228
  make_dirs([pp_dir, pp_clustermap_dir])
111
229
 
112
- if not first_pp_run:
113
- pp_adata, load_report = safe_read_h5ad(pp_adata_path)
230
+ if first_pp_run and (pp_adata_in_memory is not None):
231
+ pp_adata = pp_adata_in_memory
114
232
  else:
115
- pp_adata = adata
116
-
117
- clustermap_results = combined_raw_clustermap(pp_adata,
118
- sample_col=cfg.sample_name_col_for_plotting,
119
- reference_col=cfg.reference_column,
120
- mod_target_bases=cfg.mod_target_bases,
121
- layer_any_c=cfg.layer_for_clustermap_plotting,
122
- layer_gpc=cfg.layer_for_clustermap_plotting,
123
- layer_cpg=cfg.layer_for_clustermap_plotting,
124
- layer_a=cfg.layer_for_clustermap_plotting,
125
- cmap_any_c="coolwarm",
126
- cmap_gpc="coolwarm",
127
- cmap_cpg="viridis",
128
- cmap_a="coolwarm",
129
- min_quality=cfg.read_quality_filter_thresholds[0],
130
- min_length=cfg.read_len_filter_thresholds[0],
131
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
132
- min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
133
- bins=None,
134
- sample_mapping=None,
135
- save_path=pp_clustermap_dir,
136
- sort_by='gpc',
137
- deaminase=deaminase)
138
- if first_pp_run:
139
- adata = adata_unique
140
- else:
141
- pass
142
-
143
- else:
144
- pass
233
+ pp_adata, _ = safe_read_h5ad(pp_adata_path)
234
+
235
+ combined_raw_clustermap(
236
+ pp_adata,
237
+ sample_col=cfg.sample_name_col_for_plotting,
238
+ reference_col=cfg.reference_column,
239
+ mod_target_bases=cfg.mod_target_bases,
240
+ layer_c=cfg.layer_for_clustermap_plotting,
241
+ layer_gpc=cfg.layer_for_clustermap_plotting,
242
+ layer_cpg=cfg.layer_for_clustermap_plotting,
243
+ layer_a=cfg.layer_for_clustermap_plotting,
244
+ cmap_c=cfg.clustermap_cmap_c,
245
+ cmap_gpc=cfg.clustermap_cmap_gpc,
246
+ cmap_cpg=cfg.clustermap_cmap_cpg,
247
+ cmap_a=cfg.clustermap_cmap_a,
248
+ min_quality=cfg.read_quality_filter_thresholds[0],
249
+ min_length=cfg.read_len_filter_thresholds[0],
250
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
251
+ min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
252
+ bins=None,
253
+ sample_mapping=None,
254
+ save_path=pp_clustermap_dir,
255
+ sort_by=cfg.spatial_clustermap_sortby,
256
+ deaminase=deaminase,
257
+ index_col_suffix=cfg.reindexed_var_suffix,
258
+ )
145
259
 
146
- #### Proceed with dedeuplicated preprocessed anndata ###
147
- pp_dir = pp_dir / "deduplicated"
148
- pp_clustermap_dir = pp_dir / "06_clustermaps"
149
- pp_umap_dir = pp_dir / "07_umaps"
150
-
151
- if pp_clustermap_dir.is_dir():
152
- print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
260
+ # ============================================================
261
+ # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
262
+ # ============================================================
263
+ pp_dir_dedup = pp_dir / "deduplicated"
264
+ pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
265
+ pp_umap_dir = pp_dir_dedup / "07_umaps"
266
+
267
+ # Clustermaps on deduplicated adata
268
+ if pp_clustermap_dir_dedup.is_dir() and not getattr(
269
+ cfg, "force_redo_spatial_analyses", False
270
+ ):
271
+ print(f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData.")
153
272
  else:
154
- from ..plotting import combined_raw_clustermap
155
- make_dirs([pp_dir, pp_clustermap_dir])
156
- if smf_modality != 'direct':
157
- sort_by = 'gpc'
158
- else:
159
- sort_by = 'any_a'
160
- clustermap_results = combined_raw_clustermap(adata,
161
- sample_col=cfg.sample_name_col_for_plotting,
162
- reference_col=cfg.reference_column,
163
- mod_target_bases=cfg.mod_target_bases,
164
- layer_any_c=cfg.layer_for_clustermap_plotting,
165
- layer_gpc=cfg.layer_for_clustermap_plotting,
166
- layer_cpg=cfg.layer_for_clustermap_plotting,
167
- layer_a=cfg.layer_for_clustermap_plotting,
168
- cmap_any_c="coolwarm",
169
- cmap_gpc="coolwarm",
170
- cmap_cpg="viridis",
171
- cmap_a="coolwarm",
172
- min_quality=cfg.read_quality_filter_thresholds[0],
173
- min_length=cfg.read_len_filter_thresholds[0],
174
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
175
- min_position_valid_fraction=1-cfg.position_max_nan_threshold,
176
- bins=None,
177
- sample_mapping=None,
178
- save_path=pp_clustermap_dir,
179
- sort_by=sort_by,
180
- deaminase=deaminase)
181
-
182
- ######### PCA/UMAP/Leiden #########
183
- if pp_umap_dir.is_dir():
184
- print(f'{pp_umap_dir} already exists. Skipping UMAP plotting.')
273
+ make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
274
+ combined_raw_clustermap(
275
+ adata,
276
+ sample_col=cfg.sample_name_col_for_plotting,
277
+ reference_col=cfg.reference_column,
278
+ mod_target_bases=cfg.mod_target_bases,
279
+ layer_c=cfg.layer_for_clustermap_plotting,
280
+ layer_gpc=cfg.layer_for_clustermap_plotting,
281
+ layer_cpg=cfg.layer_for_clustermap_plotting,
282
+ layer_a=cfg.layer_for_clustermap_plotting,
283
+ cmap_c=cfg.clustermap_cmap_c,
284
+ cmap_gpc=cfg.clustermap_cmap_gpc,
285
+ cmap_cpg=cfg.clustermap_cmap_cpg,
286
+ cmap_a=cfg.clustermap_cmap_a,
287
+ min_quality=cfg.read_quality_filter_thresholds[0],
288
+ min_length=cfg.read_len_filter_thresholds[0],
289
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
290
+ min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
291
+ bins=None,
292
+ sample_mapping=None,
293
+ save_path=pp_clustermap_dir_dedup,
294
+ sort_by=cfg.spatial_clustermap_sortby,
295
+ deaminase=deaminase,
296
+ index_col_suffix=cfg.reindexed_var_suffix,
297
+ )
298
+
299
+ # UMAP / Leiden
300
+ if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
301
+ print(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
185
302
  else:
186
- from ..tools import calculate_umap
187
303
  make_dirs([pp_umap_dir])
188
304
 
189
305
  var_filters = []
190
- if smf_modality == 'direct':
306
+ if smf_modality == "direct":
191
307
  for ref in references:
192
308
  for base in cfg.mod_target_bases:
193
- var_filters += [f'{ref}_{base}_site']
309
+ var_filters.append(f"{ref}_{base}_site")
194
310
  elif deaminase:
195
311
  for ref in references:
196
- var_filters += [f'{ref}_any_C_site']
312
+ var_filters.append(f"{ref}_C_site")
197
313
  else:
198
314
  for ref in references:
199
315
  for base in cfg.mod_target_bases:
200
- var_filters += [f'{ref}_{base}_site']
316
+ var_filters.append(f"{ref}_{base}_site")
201
317
 
202
- adata = calculate_umap(adata,
203
- layer=cfg.layer_for_umap_plotting,
204
- var_filters=var_filters,
205
- n_pcs=10,
206
- knn_neighbors=15)
318
+ adata = calculate_umap(
319
+ adata,
320
+ layer=cfg.layer_for_umap_plotting,
321
+ var_filters=var_filters,
322
+ n_pcs=10,
323
+ knn_neighbors=15,
324
+ )
207
325
 
208
- ## Clustering
209
326
  sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
210
327
 
211
- # Plotting UMAP
212
328
  sc.settings.figdir = pp_umap_dir
213
- umap_layers = ['leiden', cfg.sample_name_col_for_plotting, 'Reference_strand']
329
+ umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
214
330
  umap_layers += cfg.umap_layers_to_plot
215
331
  sc.pl.umap(adata, color=umap_layers, show=False, save=True)
216
332
 
217
- ########## Spatial autocorrelation analyses ###########
218
- from ..tools.spatial_autocorrelation import binary_autocorrelation_with_spacing, analyze_autocorr_matrix, bootstrap_periodicity, rolling_autocorr_metrics
219
- from ..plotting import plot_rolling_grid
220
- import warnings
221
-
222
- pp_autocorr_dir = pp_dir / "08_autocorrelations"
333
+ # ============================================================
334
+ # 3) Spatial autocorrelation + rolling metrics
335
+ # ============================================================
336
+ pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
223
337
 
224
- if pp_autocorr_dir.is_dir():
225
- print(f'{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.')
338
+ if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
339
+ print(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
226
340
  else:
227
341
  positions = adata.var_names.astype(int).values
228
342
  lags = np.arange(cfg.autocorr_max_lag + 1)
229
343
 
230
- # optional: try to parallelize autocorr per-row with joblib
231
344
  try:
232
345
  from joblib import Parallel, delayed
233
346
  _have_joblib = True
234
347
  except Exception:
235
348
  _have_joblib = False
236
349
 
350
+ samples = adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
351
+ ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
352
+ refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
353
+
237
354
  for site_type in cfg.autocorr_site_types:
238
355
  layer_key = f"{site_type}_site_binary"
239
356
  if layer_key not in adata.layers:
@@ -245,30 +362,27 @@ def spatial_adata(config_path):
245
362
  print(f"Layer {layer_key} empty — skipping {site_type}.")
246
363
  continue
247
364
 
248
- # compute per-molecule autocorrs (and counts)
249
365
  rows = []
250
366
  counts = []
367
+
251
368
  if _have_joblib:
252
- # parallel map
253
369
  def _worker(row):
254
370
  try:
255
371
  ac, cnts = binary_autocorrelation_with_spacing(
256
372
  row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
257
373
  )
258
- except Exception as e:
259
- # on error return NaN arrays
374
+ except Exception:
260
375
  ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
261
376
  cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
262
377
  return ac, cnts
263
378
 
264
- res = Parallel(n_jobs=cfg.n_jobs if hasattr(cfg, "n_jobs") else -1)(
379
+ res = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
265
380
  delayed(_worker)(X[i]) for i in range(X.shape[0])
266
381
  )
267
382
  for ac, cnts in res:
268
383
  rows.append(ac)
269
384
  counts.append(cnts)
270
385
  else:
271
- # sequential fallback
272
386
  for i in range(X.shape[0]):
273
387
  ac, cnts = binary_autocorrelation_with_spacing(
274
388
  X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
@@ -279,21 +393,23 @@ def spatial_adata(config_path):
279
393
  autocorr_matrix = np.asarray(rows, dtype=np.float32)
280
394
  counts_matrix = np.asarray(counts, dtype=np.int32)
281
395
 
282
- # store raw per-molecule arrays (keep memory format compact)
283
396
  adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
284
397
  adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
285
398
  adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
286
399
 
287
- # compute global periodicity metrics across all molecules for this site_type
288
400
  try:
289
401
  results = analyze_autocorr_matrix(
290
- autocorr_matrix, counts_matrix, lags,
291
- nrl_search_bp=(120, 260), pad_factor=4, min_count=20, max_harmonics=6
402
+ autocorr_matrix,
403
+ counts_matrix,
404
+ lags,
405
+ nrl_search_bp=(120, 260),
406
+ pad_factor=4,
407
+ min_count=20,
408
+ max_harmonics=6,
292
409
  )
293
410
  except Exception as e:
294
411
  results = {"error": str(e)}
295
412
 
296
- # store global metrics (same keys you used)
297
413
  global_metrics = {
298
414
  "nrl_bp": results.get("nrl_bp", np.nan),
299
415
  "xi": results.get("xi", np.nan),
@@ -305,13 +421,16 @@ def spatial_adata(config_path):
305
421
  }
306
422
  adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
307
423
 
308
- # bootstrap for CI (use a reasonable default; set low only for debugging)
309
424
  n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
310
- # if user intentionally set very low n_boot in cfg, we keep that; otherwise default 200
311
425
  try:
312
426
  bs = bootstrap_periodicity(
313
- autocorr_matrix, counts_matrix, lags,
314
- n_boot=n_boot, nrl_search_bp=(120, 260), pad_factor=4, min_count=20
427
+ autocorr_matrix,
428
+ counts_matrix,
429
+ lags,
430
+ n_boot=n_boot,
431
+ nrl_search_bp=(120, 260),
432
+ pad_factor=4,
433
+ min_count=20,
315
434
  )
316
435
  adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
317
436
  "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
@@ -320,57 +439,70 @@ def spatial_adata(config_path):
320
439
  except Exception as e:
321
440
  adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
322
441
 
323
- # ----------------------------
324
- # Compute group-level metrics for plotting (per sample × reference)
325
- # ----------------------------
326
442
  metrics_by_group = {}
327
443
  sample_col = cfg.sample_name_col_for_plotting
328
- ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
329
- samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
330
- refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
331
444
 
332
- # iterate groups and run analyzer on each group's subset; cache errors
333
445
  for sample_name in samples:
334
- sample_mask = (adata.obs[sample_col].values == sample_name)
446
+ sample_mask = adata.obs[sample_col].values == sample_name
447
+
335
448
  # combined group
336
449
  mask = sample_mask
337
450
  ac_sel = autocorr_matrix[mask, :]
338
451
  cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
339
452
  if ac_sel.size:
340
453
  try:
341
- r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
342
- lags, nrl_search_bp=(120,260), pad_factor=4, min_count=10, max_harmonics=6)
454
+ r = analyze_autocorr_matrix(
455
+ ac_sel,
456
+ cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
457
+ lags,
458
+ nrl_search_bp=(120, 260),
459
+ pad_factor=4,
460
+ min_count=10,
461
+ max_harmonics=6,
462
+ )
343
463
  except Exception as e:
344
464
  r = {"error": str(e)}
345
465
  else:
346
466
  r = {"error": "no_data"}
347
467
  metrics_by_group[(sample_name, None)] = r
348
468
 
349
- # per-reference groups
350
469
  for ref in refs:
351
470
  mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
352
471
  ac_sel = autocorr_matrix[mask_ref, :]
353
472
  cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
354
473
  if ac_sel.size:
355
474
  try:
356
- r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
357
- lags, nrl_search_bp=(120,260), pad_factor=4, min_count=10, max_harmonics=6)
475
+ r = analyze_autocorr_matrix(
476
+ ac_sel,
477
+ cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
478
+ lags,
479
+ nrl_search_bp=(120, 260),
480
+ pad_factor=4,
481
+ min_count=10,
482
+ max_harmonics=6,
483
+ )
358
484
  except Exception as e:
359
485
  r = {"error": str(e)}
360
486
  else:
361
487
  r = {"error": "no_data"}
362
488
  metrics_by_group[(sample_name, ref)] = r
363
489
 
364
- # persist group metrics
365
490
  adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
366
491
 
367
492
  global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
368
493
 
369
- # configuration / sensible defaults (override in cfg if present)
370
494
  rolling_cfg = {
371
- "window_size": getattr(cfg, "rolling_window_size", getattr(cfg, "autocorr_rolling_window_size", 600)),
495
+ "window_size": getattr(
496
+ cfg,
497
+ "rolling_window_size",
498
+ getattr(cfg, "autocorr_rolling_window_size", 600),
499
+ ),
372
500
  "step": getattr(cfg, "rolling_step", 100),
373
- "max_lag": getattr(cfg, "rolling_max_lag", cfg.autocorr_max_lag if hasattr(cfg, "autocorr_max_lag") else 500),
501
+ "max_lag": getattr(
502
+ cfg,
503
+ "rolling_max_lag",
504
+ getattr(cfg, "autocorr_max_lag", 500),
505
+ ),
374
506
  "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
375
507
  "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
376
508
  "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
@@ -381,23 +513,19 @@ def spatial_adata(config_path):
381
513
 
382
514
  write_plots = getattr(cfg, "rolling_write_plots", True)
383
515
  write_csvs = getattr(cfg, "rolling_write_csvs", True)
384
- min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30) # only run rolling if group has >= this many molecules
516
+ min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
385
517
 
386
518
  rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
387
519
  os.makedirs(rolling_out_dir, exist_ok=True)
388
- # also a per-site subfolder
389
520
  site_out_dir = os.path.join(rolling_out_dir, site_type)
390
521
  os.makedirs(site_out_dir, exist_ok=True)
391
522
 
392
- combined_rows = [] # accumulate one row per window for combined CSV
393
- rolling_results_by_group = {} # store DataFrame per group in memory (persist later to adata.uns)
523
+ combined_rows = []
524
+ rolling_results_by_group = {}
394
525
 
395
- # iterate groups (samples × refs). `samples` and `refs` were computed above.
396
526
  for sample_name in samples:
397
- sample_mask = (adata.obs[sample_col].values == sample_name)
398
- # first the combined group ("all refs")
527
+ sample_mask = adata.obs[sample_col].values == sample_name
399
528
  group_masks = [("all", sample_mask)]
400
- # then per-reference groups
401
529
  for ref in refs:
402
530
  ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
403
531
  group_masks.append((ref, ref_mask))
@@ -405,17 +533,10 @@ def spatial_adata(config_path):
405
533
  for ref_label, mask in group_masks:
406
534
  n_group = int(mask.sum())
407
535
  if n_group < min_molecules_for_group:
408
- # skip tiny groups
409
- if cfg.get("verbosity", 0) if hasattr(cfg, "get") else False:
410
- print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
411
- # still write an empty CSV row set if desired; here we skip
412
536
  continue
413
537
 
414
- # extract group matrix X_group (works with dense or sparse adata.layers)
415
538
  X_group = X[mask, :]
416
- # positions already set above
417
539
  try:
418
- # call your rolling function (this may be slow; it uses cfg.n_jobs)
419
540
  df_roll = rolling_autocorr_metrics(
420
541
  X_group,
421
542
  positions,
@@ -430,17 +551,20 @@ def spatial_adata(config_path):
430
551
  max_harmonics=rolling_cfg["max_harmonics"],
431
552
  n_jobs=rolling_cfg["n_jobs"],
432
553
  verbose=False,
433
- fixed_nrl_bp=global_nrl
554
+ fixed_nrl_bp=global_nrl,
434
555
  )
435
556
  except Exception as e:
436
- warnings.warn(f"rolling_autocorr_metrics failed for {site_type} {sample_name} {ref_label}: {e}")
557
+ warnings.warn(
558
+ f"rolling_autocorr_metrics failed for {site_type} "
559
+ f"{sample_name} {ref_label}: {e}"
560
+ )
437
561
  continue
438
562
 
439
- # normalize column names and keep only the compact set you want
440
- # keep: center, n_molecules, nrl_bp, snr, xi, fwhm_bp
441
563
  if "center" not in df_roll.columns:
442
- # defensive: if the rolling function returned different schema, skip
443
- warnings.warn(f"rolling_autocorr_metrics returned unexpected schema for {site_type} {sample_name} {ref_label}")
564
+ warnings.warn(
565
+ f"rolling_autocorr_metrics returned unexpected schema "
566
+ f"for {site_type} {sample_name} {ref_label}"
567
+ )
444
568
  continue
445
569
 
446
570
  compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
@@ -448,117 +572,126 @@ def spatial_adata(config_path):
448
572
  compact_df["sample"] = sample_name
449
573
  compact_df["reference"] = ref_label if ref_label != "all" else "all"
450
574
 
451
- # save per-group CSV
452
575
  if write_csvs:
453
576
  safe_sample = str(sample_name).replace(os.sep, "_")
454
577
  safe_ref = str(ref_label if ref_label != "all" else "all").replace(os.sep, "_")
455
- out_csv = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.csv")
578
+ out_csv = os.path.join(
579
+ site_out_dir,
580
+ f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
581
+ )
456
582
  try:
457
583
  compact_df.to_csv(out_csv, index=False)
458
584
  except Exception as e:
459
585
  warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
460
586
 
461
- # save a plot per-group (NRL and SNR vs center)
462
587
  if write_plots:
463
588
  try:
464
- # use your plot helper; if it's in a different module, import accordingly
465
589
  from ..plotting import plot_rolling_metrics as _plot_roll
466
590
  except Exception:
467
- _plot_roll = globals().get("plot_rolling_metrics", None)
591
+ _plot_roll = None
468
592
  if _plot_roll is not None:
469
- plot_png = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.png")
593
+ plot_png = os.path.join(
594
+ site_out_dir,
595
+ f"{safe_sample}__{safe_ref}__rolling_metrics.png",
596
+ )
470
597
  try:
471
- _plot_roll(compact_df, out_png=plot_png,
472
- title=f"{site_type} {sample_name} {ref_label}",
473
- figsize=(10,3.5), dpi=160, show=False)
598
+ _plot_roll(
599
+ compact_df,
600
+ out_png=plot_png,
601
+ title=f"{site_type} {sample_name} {ref_label}",
602
+ figsize=(10, 3.5),
603
+ dpi=160,
604
+ show=False,
605
+ )
474
606
  except Exception as e:
475
- warnings.warn(f"Failed to create rolling plot for {site_type} {sample_name} {ref_label}: {e}")
607
+ warnings.warn(
608
+ f"Failed to create rolling plot for {site_type} "
609
+ f"{sample_name} {ref_label}: {e}"
610
+ )
476
611
 
477
- # store in combined_rows and in-memory dict
478
- combined_rows.append(compact_df.assign(site=site_type, sample=sample_name, reference=ref_label))
612
+ combined_rows.append(
613
+ compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
614
+ )
479
615
  rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
480
616
 
481
- # persist per-site rolling metrics into adata.uns as dict of DataFrames (or empty dict)
482
617
  adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
483
618
 
484
- # write combined CSV for this site across all groups
485
- if len(combined_rows):
619
+ if combined_rows:
486
620
  combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
487
- combined_out_csv = os.path.join(rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv")
621
+ combined_out_csv = os.path.join(
622
+ rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv"
623
+ )
488
624
  try:
489
625
  combined_df_site.to_csv(combined_out_csv, index=False)
490
626
  except Exception as e:
491
- warnings.warn(f"Failed to write combined rolling CSV for {site_type}: {e}")
627
+ warnings.warn(
628
+ f"Failed to write combined rolling CSV for {site_type}: {e}"
629
+ )
492
630
 
493
631
  rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
494
632
  plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
495
633
  os.makedirs(plot_out_dir, exist_ok=True)
496
- pages = plot_rolling_grid(rolling_dict, plot_out_dir, site_type,
497
- rows_per_page=cfg.rows_per_qc_autocorr_grid,
498
- cols_per_page=len(refs),
499
- dpi=160,
500
- metrics=("nrl_bp","snr", "xi"),
501
- per_metric_ylim={"snr": (0, 25)})
502
-
503
- from ..plotting import plot_spatial_autocorr_grid
504
- make_dirs([pp_autocorr_dir, pp_autocorr_dir])
505
-
506
- plot_spatial_autocorr_grid(adata,
507
- pp_autocorr_dir,
508
- site_types=cfg.autocorr_site_types,
509
- sample_col=cfg.sample_name_col_for_plotting,
510
- window=cfg.autocorr_rolling_window_size,
511
- rows_per_fig=cfg.rows_per_qc_autocorr_grid)
512
-
513
- ############ Pearson analyses ###############
514
- if smf_modality != 'direct':
515
- from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
516
-
517
- pp_corr_dir = pp_dir / "09_correlation_matrices"
518
-
519
- if pp_corr_dir.is_dir():
520
- print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
521
- else:
522
- compute_positionwise_statistics(
523
- adata,
524
- layer="nan0_0minus1",
525
- methods=cfg.correlation_matrix_types,
526
- sample_col=cfg.sample_name_col_for_plotting,
527
- ref_col=cfg.reference_column,
528
- output_key="positionwise_result",
529
- site_types=cfg.correlation_matrix_site_types,
530
- encoding="signed",
531
- max_threads=cfg.threads,
532
- min_count_for_pairwise=10,
634
+ _ = plot_rolling_grid(
635
+ rolling_dict,
636
+ plot_out_dir,
637
+ site_type,
638
+ rows_per_page=cfg.rows_per_qc_autocorr_grid,
639
+ cols_per_page=len(refs),
640
+ dpi=160,
641
+ metrics=("nrl_bp", "snr", "xi"),
642
+ per_metric_ylim={"snr": (0, 25)},
533
643
  )
534
-
535
- plot_positionwise_matrices(
644
+
645
+ make_dirs([pp_autocorr_dir])
646
+ plot_spatial_autocorr_grid(
536
647
  adata,
537
- methods=cfg.correlation_matrix_types,
648
+ pp_autocorr_dir,
649
+ site_types=cfg.autocorr_site_types,
538
650
  sample_col=cfg.sample_name_col_for_plotting,
539
- ref_col=cfg.reference_column,
540
- figsize_per_cell=(4.0, 3.0),
541
- dpi=160,
542
- cmaps=cfg.correlation_matrix_cmaps,
543
- vmin=None,
544
- vmax=None,
545
- output_dir=pp_corr_dir,
546
- output_key= "positionwise_result"
651
+ window=cfg.autocorr_rolling_window_size,
652
+ rows_per_fig=cfg.rows_per_qc_autocorr_grid,
547
653
  )
548
654
 
549
- ####### Save basic analysis adata - post preprocessing and duplicate removal ################
550
- from ..readwrite import safe_write_h5ad
551
- if not spatial_adata_path.exists() or cfg.force_redo_preprocessing:
552
- print('Saving spatial analyzed adata post preprocessing and duplicate removal')
553
- if ".gz" == spatial_adata_path.suffix:
554
- print(f"Spatial adata path: {spatial_adata_path}")
555
- safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
556
- else:
557
- spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
558
- print(f"Spatial adata path: {spatial_adata_path}")
559
- safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
560
- ############################################### smftools spatial end ###############################################
655
+ # ============================================================
656
+ # 4) Pearson / correlation matrices
657
+ # ============================================================
658
+ pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
561
659
 
562
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
660
+ if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
661
+ print(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
662
+ else:
663
+ compute_positionwise_statistics(
664
+ adata,
665
+ layer="nan0_0minus1",
666
+ methods=cfg.correlation_matrix_types,
667
+ sample_col=cfg.sample_name_col_for_plotting,
668
+ ref_col=cfg.reference_column,
669
+ output_key="positionwise_result",
670
+ site_types=cfg.correlation_matrix_site_types,
671
+ encoding="signed",
672
+ max_threads=cfg.threads,
673
+ min_count_for_pairwise=10,
674
+ )
675
+
676
+ plot_positionwise_matrices(
677
+ adata,
678
+ methods=cfg.correlation_matrix_types,
679
+ sample_col=cfg.sample_name_col_for_plotting,
680
+ ref_col=cfg.reference_column,
681
+ figsize_per_cell=(4.0, 3.0),
682
+ dpi=160,
683
+ cmaps=cfg.correlation_matrix_cmaps,
684
+ vmin=None,
685
+ vmax=None,
686
+ output_dir=pp_corr_dir,
687
+ output_key="positionwise_result",
688
+ )
689
+
690
+ # ============================================================
691
+ # 5) Save spatial AnnData
692
+ # ============================================================
693
+ if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
694
+ print("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
695
+ write_gz_h5ad(adata, spatial_adata_path)
563
696
 
564
697
  return adata, spatial_adata_path