smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,697 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
3
+
4
+ import anndata as ad
5
+
6
def spatial_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for spatial analyses.

    Called by: `smftools spatial <config_path>`

    Steps:
        1. Call `load_adata(config_path)` to obtain the config and resolve the
           canonical stage paths with `get_adata_paths(cfg)`.
        2. Unless `cfg.force_redo_spatial_analyses` is truthy, return early when
           the HMM-stage file or the spatial-stage file already exists.
        3. Call `preprocess_adata(config_path)` and pick a starting AnnData:
           the in-memory deduplicated object if one was returned, otherwise the
           first existing file among pp_dedup, pp and raw.
        4. Run `spatial_adata_core(...)` on that object.
        5. Record the spatial path in `cfg.summary_file` under "spatial_adata".

    Parameters
    ----------
    config_path : str
        Path to the experiment config, forwarded to `load_adata` and
        `preprocess_adata`.

    Returns
    -------
    spatial_adata : AnnData | None
        AnnData with spatial analyses, or None when the run was skipped
        (an HMM/spatial file already exists) or no input AnnData was found.
    spatial_adata_path : Path | None
        Path of the spatial AnnData; the HMM path when skipping because the HMM
        file exists; None when no input AnnData was found.
    """
    from ..readwrite import safe_read_h5ad, add_or_update_column_in_csv
    from .load_adata import load_adata
    from .preprocess_adata import preprocess_adata
    from .helpers import get_adata_paths

    # 1) Ensure config + basic paths via load_adata
    loaded_adata, loaded_path, cfg = load_adata(config_path)
    paths = get_adata_paths(cfg)

    raw_path = paths.raw
    pp_path = paths.pp
    pp_dedup_path = paths.pp_dedup
    spatial_path = paths.spatial
    hmm_path = paths.hmm

    # Stage-skipping logic: HMM is checked first, so when both files exist the
    # HMM path is the one returned.
    if not getattr(cfg, "force_redo_spatial_analyses", False):
        if hmm_path.exists():
            print(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
            return None, hmm_path

        if spatial_path.exists():
            print(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
            return None, spatial_path

    # 2) Ensure preprocessing has been run. Only the AnnData objects are needed
    #    here; the returned paths are ignored in favour of the canonical ones.
    pp_adata, _, pp_dedup_adata, _ = preprocess_adata(config_path)

    def _load(path: Path):
        """Return the AnnData at `path`, reusing `loaded_adata` when it came from that path."""
        if loaded_adata is not None and loaded_path == path:
            return loaded_adata
        adata, _ = safe_read_h5ad(path)
        return adata

    # 3) Decide which AnnData is the starting point for spatial analyses.
    if pp_dedup_adata is not None:
        start_adata = pp_dedup_adata
    elif pp_dedup_path.exists():
        start_adata = _load(pp_dedup_path)
    elif pp_path.exists():
        start_adata = _load(pp_path)
    elif raw_path.exists():
        start_adata = _load(raw_path)
    else:
        print("No suitable AnnData found for spatial analyses (need at least raw).")
        return None, None

    # 4) Run the spatial core
    adata_spatial, spatial_path = spatial_adata_core(
        adata=start_adata,
        cfg=cfg,
        spatial_adata_path=spatial_path,
        pp_adata_path=pp_path,
        pp_dup_rem_adata_path=pp_dedup_path,
        pp_adata_in_memory=pp_adata,
    )

    # 5) Register spatial path in summary CSV
    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)

    return adata_spatial, spatial_path
97
+
98
+
99
def spatial_adata_core(
    adata: ad.AnnData,
    cfg,
    spatial_adata_path: Path,
    pp_adata_path: Path,
    pp_dup_rem_adata_path: Path,
    pp_adata_in_memory: Optional[ad.AnnData] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core spatial analysis pipeline.

    Each plotting stage writes into its own directory under
    `<cfg.output_directory>/preprocessed` and is skipped when that directory
    already exists, unless `cfg.force_redo_spatial_analyses` is truthy.

    Steps, in order:
        - Optional sample sheet load (when `cfg.sample_sheet_path` is set).
        - Optional inversion (when `cfg.invert_adata` is truthy), then
          reference reindexing.
        - Clustermaps of the preprocessed (non-dedup) AnnData at
          `pp_adata_path`; only when `cfg.smf_modality != "direct"` and that
          file exists.
        - Clustermaps of `adata`, then UMAP + Leiden on `adata`.
        - Per-site-type autocorrelation, periodicity metrics, bootstrap,
          per-group metrics and rolling metrics. Results are stored in
          `adata.obsm` / `adata.uns` under keys prefixed with the site type;
          CSVs and plots go under `08_autocorrelations`.
        - Positionwise correlation matrices.
        - Write `adata` to `spatial_adata_path` if that file does not exist
          yet or a redo is forced.

    Parameters
    ----------
    adata : AnnData
        Starting AnnData (typically the preprocessed, duplicate-removed one).
    cfg
        Experiment configuration object; read via attribute access.
    spatial_adata_path : Path
        Destination of the spatial AnnData.
    pp_adata_path : Path
        Location of the preprocessed (non-dedup) AnnData.
    pp_dup_rem_adata_path : Path
        Location of the deduplicated AnnData; only its existence is checked.
    pp_adata_in_memory : AnnData | None
        Preprocessed (non-dedup) AnnData already in memory. It is used instead
        of reading `pp_adata_path` only when `pp_dup_rem_adata_path` exists.

    Returns
    -------
    adata : AnnData
        The analyzed AnnData. `invert_adata` and `calculate_umap` results are
        rebound to `adata`, so this is whatever those calls returned.
    spatial_adata_path : Path
        The `spatial_adata_path` argument, unchanged.
    """
    import os
    import warnings
    from pathlib import Path

    import numpy as np
    import pandas as pd
    import scanpy as sc

    from ..readwrite import make_dirs, safe_read_h5ad
    from .helpers import write_gz_h5ad

    from ..preprocessing import (
        load_sample_sheet,
        invert_adata,
        reindex_references_adata,
    )
    from ..plotting import (
        combined_raw_clustermap,
        plot_rolling_grid,
        plot_spatial_autocorr_grid,
    )
    from ..tools import calculate_umap
    from ..tools.spatial_autocorrelation import (
        binary_autocorrelation_with_spacing,
        analyze_autocorr_matrix,
        bootstrap_periodicity,
        rolling_autocorr_metrics,
    )
    from ..tools.position_stats import (
        compute_positionwise_statistics,
        plot_positionwise_matrices,
    )

    # -----------------------------
    # General setup
    # -----------------------------
    output_directory = Path(cfg.output_directory)
    make_dirs([output_directory])

    force_redo = bool(getattr(cfg, "force_redo_spatial_analyses", False))

    smf_modality = cfg.smf_modality
    # Every modality other than "conversion" (including "direct") sets this flag.
    deaminase = smf_modality != "conversion"

    first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()

    def _plot_clustermap(target, min_position_valid_fraction, save_path):
        """Call `combined_raw_clustermap` with the cfg-driven options shared by both clustermap stages."""
        combined_raw_clustermap(
            target,
            sample_col=cfg.sample_name_col_for_plotting,
            reference_col=cfg.reference_column,
            mod_target_bases=cfg.mod_target_bases,
            layer_c=cfg.layer_for_clustermap_plotting,
            layer_gpc=cfg.layer_for_clustermap_plotting,
            layer_cpg=cfg.layer_for_clustermap_plotting,
            layer_a=cfg.layer_for_clustermap_plotting,
            cmap_c=cfg.clustermap_cmap_c,
            cmap_gpc=cfg.clustermap_cmap_gpc,
            cmap_cpg=cfg.clustermap_cmap_cpg,
            cmap_a=cfg.clustermap_cmap_a,
            min_quality=cfg.read_quality_filter_thresholds[0],
            min_length=cfg.read_len_filter_thresholds[0],
            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
            min_position_valid_fraction=min_position_valid_fraction,
            bins=None,
            sample_mapping=None,
            save_path=save_path,
            sort_by=cfg.spatial_clustermap_sortby,
            deaminase=deaminase,
            index_col_suffix=cfg.reindexed_var_suffix,
        )

    def _analyze_group(ac_sel, cnt_sel, lag_values):
        """Run the periodicity analyzer on a row subset; return an error dict instead of raising."""
        if not ac_sel.size:
            return {"error": "no_data"}
        try:
            return analyze_autocorr_matrix(
                ac_sel,
                cnt_sel,
                lag_values,
                nrl_search_bp=(120, 260),
                pad_factor=4,
                min_count=10,
                max_harmonics=6,
            )
        except Exception as e:
            return {"error": str(e)}

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # -----------------------------
    # Optional inversion along positions axis
    # -----------------------------
    if getattr(cfg, "invert_adata", False):
        adata = invert_adata(adata)

    # -----------------------------
    # Reindexing by reference (always called; offsets come from cfg)
    # -----------------------------
    reindex_references_adata(
        adata,
        reference_col=cfg.reference_column,
        offsets=cfg.reindexing_offsets,
        new_col=cfg.reindexed_var_suffix,
    )

    pp_dir = output_directory / "preprocessed"
    references = adata.obs[cfg.reference_column].cat.categories

    # ============================================================
    # 1) Clustermaps (non-direct modalities) on *preprocessed* data
    # ============================================================
    if smf_modality != "direct" and pp_adata_path.exists():
        pp_clustermap_dir = pp_dir / "06_clustermaps"

        if pp_clustermap_dir.is_dir() and not force_redo:
            print(f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData.")
        else:
            make_dirs([pp_dir, pp_clustermap_dir])

            if first_pp_run:
                pp_adata = pp_adata_in_memory
            else:
                pp_adata, _ = safe_read_h5ad(pp_adata_path)

            # NOTE(review): this stage uses
            # cfg.min_valid_fraction_positions_in_read_vs_ref while the
            # deduplicated stage uses 1 - cfg.position_max_nan_threshold;
            # confirm the difference is intentional.
            _plot_clustermap(
                pp_adata,
                cfg.min_valid_fraction_positions_in_read_vs_ref,
                pp_clustermap_dir,
            )

    # ============================================================
    # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
    # ============================================================
    pp_dir_dedup = pp_dir / "deduplicated"
    pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
    pp_umap_dir = pp_dir_dedup / "07_umaps"

    # Clustermaps on deduplicated adata
    if pp_clustermap_dir_dedup.is_dir() and not force_redo:
        print(f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData.")
    else:
        make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
        _plot_clustermap(
            adata,
            1 - cfg.position_max_nan_threshold,
            pp_clustermap_dir_dedup,
        )

    # UMAP / Leiden
    if pp_umap_dir.is_dir() and not force_redo:
        print(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
    else:
        make_dirs([pp_umap_dir])

        # Only the non-direct, non-conversion case restricts features to C sites;
        # the other two cases use one filter per (reference, target base).
        if smf_modality != "direct" and deaminase:
            var_filters = [f"{ref}_C_site" for ref in references]
        else:
            var_filters = [
                f"{ref}_{base}_site"
                for ref in references
                for base in cfg.mod_target_bases
            ]

        adata = calculate_umap(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_pcs=10,
            knn_neighbors=15,
        )

        sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)

        sc.settings.figdir = pp_umap_dir
        umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
        umap_layers += cfg.umap_layers_to_plot
        sc.pl.umap(adata, color=umap_layers, show=False, save=True)

    # ============================================================
    # 3) Spatial autocorrelation + rolling metrics
    # ============================================================
    pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"

    if pp_autocorr_dir.is_dir() and not force_redo:
        print(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
    else:
        positions = adata.var_names.astype(int).values
        max_lag = cfg.autocorr_max_lag
        lags = np.arange(max_lag + 1)

        # joblib is optional; without it rows are processed serially.
        try:
            from joblib import Parallel, delayed
            _have_joblib = True
        except Exception:
            _have_joblib = False

        sample_col = cfg.sample_name_col_for_plotting
        ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
        samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
        refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
        sample_values = adata.obs[sample_col].values
        ref_values = adata.obs[ref_col].values

        # Rolling-window settings depend only on cfg, so they are resolved once.
        # The keys match the keyword names of `rolling_autocorr_metrics`.
        rolling_kwargs = {
            "window_size": getattr(
                cfg,
                "rolling_window_size",
                getattr(cfg, "autocorr_rolling_window_size", 600),
            ),
            "step": getattr(cfg, "rolling_step", 100),
            "max_lag": getattr(
                cfg,
                "rolling_max_lag",
                getattr(cfg, "autocorr_max_lag", 500),
            ),
            "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
            "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
            "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
            "min_count_for_mean": getattr(cfg, "rolling_min_count_for_mean", 10),
            "max_harmonics": getattr(cfg, "rolling_max_harmonics", 6),
            "n_jobs": getattr(cfg, "rolling_n_jobs", 4),
        }
        write_plots = getattr(cfg, "rolling_write_plots", True)
        write_csvs = getattr(cfg, "rolling_write_csvs", True)
        min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)

        # The per-group plotter is optional; a failed import disables plotting only.
        _plot_roll = None
        if write_plots:
            try:
                from ..plotting import plot_rolling_metrics as _plot_roll
            except Exception:
                _plot_roll = None

        def _worker(row):
            """Autocorrelate one row; on failure return a NaN profile with zero counts."""
            try:
                ac, cnts = binary_autocorrelation_with_spacing(
                    row, positions, max_lag=max_lag, return_counts=True
                )
            except Exception:
                ac = np.full(max_lag + 1, np.nan, dtype=np.float32)
                cnts = np.zeros(max_lag + 1, dtype=np.int32)
            return ac, cnts

        for site_type in cfg.autocorr_site_types:
            layer_key = f"{site_type}_site_binary"
            if layer_key not in adata.layers:
                print(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
                continue

            X = adata.layers[layer_key]
            if getattr(X, "shape", (0,))[0] == 0:
                print(f"Layer {layer_key} empty — skipping {site_type}.")
                continue

            # NOTE(review): the joblib path swallows per-row failures (NaN row),
            # while the serial fallback lets them propagate — confirm whether
            # the two paths are meant to differ.
            if _have_joblib:
                per_row = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
                    delayed(_worker)(X[i]) for i in range(X.shape[0])
                )
            else:
                per_row = [
                    binary_autocorrelation_with_spacing(
                        X[i], positions, max_lag=max_lag, return_counts=True
                    )
                    for i in range(X.shape[0])
                ]

            autocorr_matrix = np.asarray([ac for ac, _ in per_row], dtype=np.float32)
            counts_matrix = np.asarray([cnts for _, cnts in per_row], dtype=np.int32)

            adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
            adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
            adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags

            # Global (all reads) periodicity metrics; analyzer failures are
            # recorded under "analyzer_error" rather than raised.
            try:
                results = analyze_autocorr_matrix(
                    autocorr_matrix,
                    counts_matrix,
                    lags,
                    nrl_search_bp=(120, 260),
                    pad_factor=4,
                    min_count=20,
                    max_harmonics=6,
                )
            except Exception as e:
                results = {"error": str(e)}

            global_metrics = {
                "nrl_bp": results.get("nrl_bp", np.nan),
                "xi": results.get("xi", np.nan),
                "snr": results.get("snr", np.nan),
                "fwhm_bp": results.get("fwhm_bp", np.nan),
                "envelope_sample_lags": results.get("envelope_sample_lags", np.array([])).tolist(),
                "envelope_heights": results.get("envelope_heights", np.array([])).tolist(),
                "analyzer_error": results.get("error", None),
            }
            adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics

            # Bootstrap; a failure is stored as a message under a separate key.
            n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
            try:
                bs = bootstrap_periodicity(
                    autocorr_matrix,
                    counts_matrix,
                    lags,
                    n_boot=n_boot,
                    nrl_search_bp=(120, 260),
                    pad_factor=4,
                    min_count=20,
                )
                adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
                    "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
                    "xi_boot": np.asarray(bs["xi_boot"]).tolist(),
                }
            except Exception as e:
                adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)

            # Per-group metrics keyed by (sample, reference); reference None
            # means all references of that sample combined.
            metrics_by_group = {}
            for sample_name in samples:
                sample_mask = sample_values == sample_name
                metrics_by_group[(sample_name, None)] = _analyze_group(
                    autocorr_matrix[sample_mask, :],
                    counts_matrix[sample_mask, :],
                    lags,
                )
                for ref in refs:
                    mask_ref = sample_mask & (ref_values == ref)
                    metrics_by_group[(sample_name, ref)] = _analyze_group(
                        autocorr_matrix[mask_ref, :],
                        counts_matrix[mask_ref, :],
                        lags,
                    )

            adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group

            # The global estimate is passed to the rolling analysis as fixed_nrl_bp.
            global_nrl = global_metrics["nrl_bp"]

            rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
            os.makedirs(rolling_out_dir, exist_ok=True)
            site_out_dir = os.path.join(rolling_out_dir, site_type)
            os.makedirs(site_out_dir, exist_ok=True)

            combined_rows = []
            rolling_results_by_group = {}

            for sample_name in samples:
                sample_mask = sample_values == sample_name
                group_masks = [("all", sample_mask)]
                group_masks.extend(
                    (ref, sample_mask & (ref_values == ref)) for ref in refs
                )

                for ref_label, mask in group_masks:
                    if int(mask.sum()) < min_molecules_for_group:
                        continue

                    try:
                        df_roll = rolling_autocorr_metrics(
                            X[mask, :],
                            positions,
                            site_label=site_type,
                            verbose=False,
                            fixed_nrl_bp=global_nrl,
                            **rolling_kwargs,
                        )
                    except Exception as e:
                        warnings.warn(
                            f"rolling_autocorr_metrics failed for {site_type} "
                            f"{sample_name} {ref_label}: {e}"
                        )
                        continue

                    if "center" not in df_roll.columns:
                        warnings.warn(
                            f"rolling_autocorr_metrics returned unexpected schema "
                            f"for {site_type} {sample_name} {ref_label}"
                        )
                        continue

                    compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
                    compact_df["site"] = site_type
                    compact_df["sample"] = sample_name
                    compact_df["reference"] = ref_label

                    # Built unconditionally: both the CSV and the plot file
                    # names use these, and either output can be enabled alone.
                    safe_sample = str(sample_name).replace(os.sep, "_")
                    safe_ref = str(ref_label).replace(os.sep, "_")

                    if write_csvs:
                        out_csv = os.path.join(
                            site_out_dir,
                            f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
                        )
                        try:
                            compact_df.to_csv(out_csv, index=False)
                        except Exception as e:
                            warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")

                    if write_plots and _plot_roll is not None:
                        plot_png = os.path.join(
                            site_out_dir,
                            f"{safe_sample}__{safe_ref}__rolling_metrics.png",
                        )
                        try:
                            _plot_roll(
                                compact_df,
                                out_png=plot_png,
                                title=f"{site_type} {sample_name} {ref_label}",
                                figsize=(10, 3.5),
                                dpi=160,
                                show=False,
                            )
                        except Exception as e:
                            warnings.warn(
                                f"Failed to create rolling plot for {site_type} "
                                f"{sample_name} {ref_label}: {e}"
                            )

                    combined_rows.append(compact_df.copy())
                    group_key = (sample_name, None if ref_label == "all" else ref_label)
                    rolling_results_by_group[group_key] = compact_df

            adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group

            if combined_rows:
                combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
                combined_out_csv = os.path.join(
                    rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv"
                )
                try:
                    combined_df_site.to_csv(combined_out_csv, index=False)
                except Exception as e:
                    warnings.warn(
                        f"Failed to write combined rolling CSV for {site_type}: {e}"
                    )

            plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
            os.makedirs(plot_out_dir, exist_ok=True)
            plot_rolling_grid(
                rolling_results_by_group,
                plot_out_dir,
                site_type,
                rows_per_page=cfg.rows_per_qc_autocorr_grid,
                cols_per_page=len(refs),
                dpi=160,
                metrics=("nrl_bp", "snr", "xi"),
                per_metric_ylim={"snr": (0, 25)},
            )

        # Creates the directory even when every site type was skipped above.
        make_dirs([pp_autocorr_dir])
        plot_spatial_autocorr_grid(
            adata,
            pp_autocorr_dir,
            site_types=cfg.autocorr_site_types,
            sample_col=cfg.sample_name_col_for_plotting,
            window=cfg.autocorr_rolling_window_size,
            rows_per_fig=cfg.rows_per_qc_autocorr_grid,
        )

    # ============================================================
    # 4) Pearson / correlation matrices
    # ============================================================
    pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"

    if pp_corr_dir.is_dir() and not force_redo:
        print(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
    else:
        compute_positionwise_statistics(
            adata,
            layer="nan0_0minus1",
            methods=cfg.correlation_matrix_types,
            sample_col=cfg.sample_name_col_for_plotting,
            ref_col=cfg.reference_column,
            output_key="positionwise_result",
            site_types=cfg.correlation_matrix_site_types,
            encoding="signed",
            max_threads=cfg.threads,
            min_count_for_pairwise=10,
        )

        plot_positionwise_matrices(
            adata,
            methods=cfg.correlation_matrix_types,
            sample_col=cfg.sample_name_col_for_plotting,
            ref_col=cfg.reference_column,
            figsize_per_cell=(4.0, 3.0),
            dpi=160,
            cmaps=cfg.correlation_matrix_cmaps,
            vmin=None,
            vmax=None,
            output_dir=pp_corr_dir,
            output_key="positionwise_result",
        )

    # ============================================================
    # 5) Save spatial AnnData
    # ============================================================
    if (not spatial_adata_path.exists()) or force_redo:
        print("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
        write_gz_h5ad(adata, spatial_adata_path)

    return adata, spatial_adata_path