smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
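The headline change in 0.2.3 is the CLI reorganization: the monolithic smftools/cli.py and smftools/load_adata.py are removed in favor of a smftools/cli package (load_adata, preprocess_adata, spatial_adata, hmm_adata) dispatched through the new smftools/cli_entry.py, with the legacy informatics helpers moved under archived/. The new spatial stage is reproduced below; first, a minimal sketch of how the staged flow composes, where the import path and chaining are assumptions based on the file list and the function's own docstring:

    from smftools.cli.spatial_adata import spatial_adata

    # Equivalent to running `smftools spatial experiment_config.csv` on the command line.
    # spatial_adata() runs the load and preprocess stages itself before the spatial
    # analyses, then writes the spatially analyzed AnnData next to the other outputs.
    adata, spatial_path = spatial_adata("experiment_config.csv")  # hypothetical config CSV
    print(f"Spatial AnnData written to: {spatial_path}")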
smftools/cli/spatial_adata.py (new file)
@@ -0,0 +1,564 @@
+def spatial_adata(config_path):
+    """
+    High-level function to call for spatial analysis of an AnnData object.
+    The command line accesses this through `smftools spatial <config_path>`.
+
+    Parameters:
+        config_path (str): File path to the experiment configuration CSV file.
+
+    Returns:
+        (adata, spatial_adata_path): the spatially analyzed AnnData (or None) and the
+        path it was written to.
+    """
+    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+    from .load_adata import load_adata
+    from .preprocess_adata import preprocess_adata
+
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import scanpy as sc
+
+    import os
+    from importlib import resources
+    from pathlib import Path
+
+    from datetime import datetime
+    date_str = datetime.today().strftime("%y%m%d")
+
+    ############################################### smftools load start ###############################################
+    adata, adata_path, cfg = load_adata(config_path)
+    # General config variable init - necessary user-passed inputs
+    smf_modality = cfg.smf_modality  # specifies whether the data is conversion SMF, deaminase SMF, or direct methylation-detection SMF. Necessary.
+    output_directory = Path(cfg.output_directory)  # path to the output directory to create for the analysis. Necessary.
+    # Make the initial output directory
+    make_dirs([output_directory])
+    ############################################### smftools load end ###############################################
+
+    ############################################### smftools preprocess start ###############################################
+    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+    ############################################### smftools preprocess end ###############################################
+
+    ############################################### smftools spatial start ###############################################
+    input_manager_df = pd.read_csv(cfg.summary_file)
+    initial_adata_path = Path(input_manager_df['load_adata'][0])
+    pp_adata_path = Path(input_manager_df['pp_adata'][0])
+    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+    if smf_modality == 'conversion':
+        deaminase = False
+    else:
+        deaminase = True
+
+    # Check which saved stages of the anndata are available on disk
+    # (these flags are also used by the plotting sections below)
+    initial_version_available = initial_adata_path.exists()
+    preprocessed_version_available = pp_adata_path.exists()
+    preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+    preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
+    hmm_version_available = hmm_adata_path.exists()
+
+    if pp_adata is not None and pp_dedup_adata is not None:
+        # This happens on the first run of the preprocessing pipeline
+        first_pp_run = True
+        adata = pp_adata
+        adata_unique = pp_dedup_adata
+    else:
+        # Otherwise, fall back to the most processed anndata saved on disk
+        first_pp_run = False
+        if cfg.force_redo_basic_analyses:
+            print("Forcing redo of the basic analysis workflow, starting from the preprocessed adata if available; otherwise the raw adata will be used.")
+            if preprocessed_dup_removed_version_available:
+                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+                adata_version = "pp_dedup"
+            elif preprocessed_version_available:
+                adata, load_report = safe_read_h5ad(pp_adata_path)
+                adata_version = "pp"
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+                adata_version = "initial"
+            else:
+                print("Cannot redo duplicate detection when no compatible adata is available: either a raw or a preprocessed adata is required.")
+                return None, None
+        elif preprocessed_dedup_spatial_version_available:
+            print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
+            return None, spatial_adata_path
+        elif preprocessed_dup_removed_version_available:
+            adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+            adata_version = "pp_dedup"
+        elif preprocessed_version_available:
+            adata, load_report = safe_read_h5ad(pp_adata_path)
+            adata_version = "pp"
+        elif initial_version_available:
+            adata, load_report = safe_read_h5ad(initial_adata_path)
+            adata_version = "initial"
+        else:
+            print("No adata available.")
+            return None, None
+
+    pp_dir = output_directory / "preprocessed"
+    references = adata.obs[cfg.reference_column].cat.categories
+
+    if smf_modality != 'direct':
+        ######### Clustermaps #########
+        if preprocessed_version_available:
+            pp_clustermap_dir = pp_dir / "06_clustermaps"
+
+            if pp_clustermap_dir.is_dir():
+                print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
+            else:
+                from ..plotting import combined_raw_clustermap
+                make_dirs([pp_dir, pp_clustermap_dir])
+
+                if not first_pp_run:
+                    pp_adata, load_report = safe_read_h5ad(pp_adata_path)
+                else:
+                    pp_adata = adata
+
+                clustermap_results = combined_raw_clustermap(
+                    pp_adata,
+                    sample_col=cfg.sample_name_col_for_plotting,
+                    reference_col=cfg.reference_column,
+                    mod_target_bases=cfg.mod_target_bases,
+                    layer_any_c=cfg.layer_for_clustermap_plotting,
+                    layer_gpc=cfg.layer_for_clustermap_plotting,
+                    layer_cpg=cfg.layer_for_clustermap_plotting,
+                    layer_a=cfg.layer_for_clustermap_plotting,
+                    cmap_any_c="coolwarm",
+                    cmap_gpc="coolwarm",
+                    cmap_cpg="viridis",
+                    cmap_a="coolwarm",
+                    min_quality=cfg.read_quality_filter_thresholds[0],
+                    min_length=cfg.read_len_filter_thresholds[0],
+                    min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+                    min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
+                    bins=None,
+                    sample_mapping=None,
+                    save_path=pp_clustermap_dir,
+                    sort_by='gpc',
+                    deaminase=deaminase)
+
+    #### Proceed with the deduplicated preprocessed anndata ####
+    if first_pp_run:
+        adata = adata_unique
+
+    pp_dir = pp_dir / "deduplicated"
+    pp_clustermap_dir = pp_dir / "06_clustermaps"
+    pp_umap_dir = pp_dir / "07_umaps"
+
+    if pp_clustermap_dir.is_dir():
+        print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
+    else:
+        from ..plotting import combined_raw_clustermap
+        make_dirs([pp_dir, pp_clustermap_dir])
+        if smf_modality != 'direct':
+            sort_by = 'gpc'
+        else:
+            sort_by = 'any_a'
+        clustermap_results = combined_raw_clustermap(
+            adata,
+            sample_col=cfg.sample_name_col_for_plotting,
+            reference_col=cfg.reference_column,
+            mod_target_bases=cfg.mod_target_bases,
+            layer_any_c=cfg.layer_for_clustermap_plotting,
+            layer_gpc=cfg.layer_for_clustermap_plotting,
+            layer_cpg=cfg.layer_for_clustermap_plotting,
+            layer_a=cfg.layer_for_clustermap_plotting,
+            cmap_any_c="coolwarm",
+            cmap_gpc="coolwarm",
+            cmap_cpg="viridis",
+            cmap_a="coolwarm",
+            min_quality=cfg.read_quality_filter_thresholds[0],
+            min_length=cfg.read_len_filter_thresholds[0],
+            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+            min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
+            bins=None,
+            sample_mapping=None,
+            save_path=pp_clustermap_dir,
+            sort_by=sort_by,
+            deaminase=deaminase)
+
+    ######### PCA/UMAP/Leiden #########
+    if pp_umap_dir.is_dir():
+        print(f'{pp_umap_dir} already exists. Skipping UMAP plotting.')
+    else:
+        from ..tools import calculate_umap
+        make_dirs([pp_umap_dir])
+
+        var_filters = []
+        if smf_modality == 'direct':
+            for ref in references:
+                for base in cfg.mod_target_bases:
+                    var_filters += [f'{ref}_{base}_site']
+        elif deaminase:
+            for ref in references:
+                var_filters += [f'{ref}_any_C_site']
+        else:
+            for ref in references:
+                for base in cfg.mod_target_bases:
+                    var_filters += [f'{ref}_{base}_site']
+
+        adata = calculate_umap(adata,
+                               layer=cfg.layer_for_umap_plotting,
+                               var_filters=var_filters,
+                               n_pcs=10,
+                               knn_neighbors=15)
+
+        ## Clustering
+        sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
+
+        # Plotting UMAP
+        sc.settings.figdir = pp_umap_dir
+        umap_layers = ['leiden', cfg.sample_name_col_for_plotting, 'Reference_strand']
+        umap_layers += cfg.umap_layers_to_plot
+        sc.pl.umap(adata, color=umap_layers, show=False, save=True)
+
+    ########## Spatial autocorrelation analyses ###########
+    from ..tools.spatial_autocorrelation import binary_autocorrelation_with_spacing, analyze_autocorr_matrix, bootstrap_periodicity, rolling_autocorr_metrics
+    from ..plotting import plot_rolling_grid
+    import warnings
+
+    pp_autocorr_dir = pp_dir / "08_autocorrelations"
+
+    if pp_autocorr_dir.is_dir():
+        print(f'{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.')
+    else:
+        positions = adata.var_names.astype(int).values
+        lags = np.arange(cfg.autocorr_max_lag + 1)
+
+        # optional: try to parallelize autocorr per-row with joblib
+        try:
+            from joblib import Parallel, delayed
+            _have_joblib = True
+        except Exception:
+            _have_joblib = False
+
+        for site_type in cfg.autocorr_site_types:
+            layer_key = f"{site_type}_site_binary"
+            if layer_key not in adata.layers:
+                print(f"Layer {layer_key} not found in adata.layers; skipping {site_type}.")
+                continue
+
+            X = adata.layers[layer_key]
+            if getattr(X, "shape", (0,))[0] == 0:
+                print(f"Layer {layer_key} is empty; skipping {site_type}.")
+                continue
+
+            # compute per-molecule autocorrelations (and valid-pair counts)
+            rows = []
+            counts = []
+            if _have_joblib:
+                # parallel map over molecules
+                def _worker(row):
+                    try:
+                        ac, cnts = binary_autocorrelation_with_spacing(
+                            row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
+                        )
+                    except Exception:
+                        # on error, return NaN autocorrelations and zero counts
+                        ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
+                        cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
+                    return ac, cnts
+
+                res = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
+                    delayed(_worker)(X[i]) for i in range(X.shape[0])
+                )
+                for ac, cnts in res:
+                    rows.append(ac)
+                    counts.append(cnts)
+            else:
+                # sequential fallback
+                for i in range(X.shape[0]):
+                    ac, cnts = binary_autocorrelation_with_spacing(
+                        X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
+                    )
+                    rows.append(ac)
+                    counts.append(cnts)
+
+            autocorr_matrix = np.asarray(rows, dtype=np.float32)
+            counts_matrix = np.asarray(counts, dtype=np.int32)
+
+            # store raw per-molecule arrays (keep the memory format compact)
+            adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
+            adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
+            adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
+
+            # compute global periodicity metrics across all molecules for this site_type
+            try:
+                results = analyze_autocorr_matrix(
+                    autocorr_matrix, counts_matrix, lags,
+                    nrl_search_bp=(120, 260), pad_factor=4, min_count=20, max_harmonics=6
+                )
+            except Exception as e:
+                results = {"error": str(e)}
+
+            # store global metrics
+            global_metrics = {
+                "nrl_bp": results.get("nrl_bp", np.nan),
+                "xi": results.get("xi", np.nan),
+                "snr": results.get("snr", np.nan),
+                "fwhm_bp": results.get("fwhm_bp", np.nan),
+                "envelope_sample_lags": results.get("envelope_sample_lags", np.array([])).tolist(),
+                "envelope_heights": results.get("envelope_heights", np.array([])).tolist(),
+                "analyzer_error": results.get("error", None),
+            }
+            adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
+
+            # bootstrap for confidence intervals; if the user intentionally set a very low
+            # n_boot in cfg we keep it, otherwise default to 200 (set low only for debugging)
+            n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
+            try:
+                bs = bootstrap_periodicity(
+                    autocorr_matrix, counts_matrix, lags,
+                    n_boot=n_boot, nrl_search_bp=(120, 260), pad_factor=4, min_count=20
+                )
+                adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
+                    "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
+                    "xi_boot": np.asarray(bs["xi_boot"]).tolist(),
+                }
+            except Exception as e:
+                adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
+
+            # ----------------------------
+            # Compute group-level metrics for plotting (per sample × reference)
+            # ----------------------------
+            metrics_by_group = {}
+            sample_col = cfg.sample_name_col_for_plotting
+            ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
+            samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
+            refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
+
+            # iterate groups and run the analyzer on each group's subset; cache errors
+            for sample_name in samples:
+                sample_mask = (adata.obs[sample_col].values == sample_name)
+                # combined group (all references)
+                mask = sample_mask
+                ac_sel = autocorr_matrix[mask, :]
+                cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
+                if ac_sel.size:
+                    try:
+                        r = analyze_autocorr_matrix(
+                            ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
+                            lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
+                    except Exception as e:
+                        r = {"error": str(e)}
+                else:
+                    r = {"error": "no_data"}
+                metrics_by_group[(sample_name, None)] = r
+
+                # per-reference groups
+                for ref in refs:
+                    mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
+                    ac_sel = autocorr_matrix[mask_ref, :]
+                    cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
+                    if ac_sel.size:
+                        try:
+                            r = analyze_autocorr_matrix(
+                                ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
+                                lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
+                        except Exception as e:
+                            r = {"error": str(e)}
+                    else:
+                        r = {"error": "no_data"}
+                    metrics_by_group[(sample_name, ref)] = r
+
+            # persist group metrics
+            adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
+
+            global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
+
+            # configuration / sensible defaults (override in cfg if present)
+            rolling_cfg = {
+                "window_size": getattr(cfg, "rolling_window_size", getattr(cfg, "autocorr_rolling_window_size", 600)),
+                "step": getattr(cfg, "rolling_step", 100),
+                "max_lag": getattr(cfg, "rolling_max_lag", getattr(cfg, "autocorr_max_lag", 500)),
+                "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
+                "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
+                "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
+                "min_count_for_mean": getattr(cfg, "rolling_min_count_for_mean", 10),
+                "max_harmonics": getattr(cfg, "rolling_max_harmonics", 6),
+                "n_jobs": getattr(cfg, "rolling_n_jobs", 4),
+            }
+
+            write_plots = getattr(cfg, "rolling_write_plots", True)
+            write_csvs = getattr(cfg, "rolling_write_csvs", True)
+            min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)  # only run rolling on groups with at least this many molecules
+
+            rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
+            os.makedirs(rolling_out_dir, exist_ok=True)
+            # also a per-site subfolder
+            site_out_dir = os.path.join(rolling_out_dir, site_type)
+            os.makedirs(site_out_dir, exist_ok=True)
+
+            combined_rows = []  # accumulate one row per window for the combined CSV
+            rolling_results_by_group = {}  # DataFrame per group, persisted later to adata.uns
+
+            # iterate groups (samples × refs); `samples` and `refs` were computed above
+            for sample_name in samples:
+                sample_mask = (adata.obs[sample_col].values == sample_name)
+                # first the combined group ("all refs")
+                group_masks = [("all", sample_mask)]
+                # then per-reference groups
+                for ref in refs:
+                    ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
+                    group_masks.append((ref, ref_mask))
+
+                for ref_label, mask in group_masks:
+                    n_group = int(mask.sum())
+                    if n_group < min_molecules_for_group:
+                        # skip tiny groups (an empty CSV row set could be written here instead; we simply skip)
+                        if getattr(cfg, "verbosity", 0):
+                            print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
+                        continue
+
+                    # extract the group matrix (works with dense or sparse adata.layers)
+                    X_group = X[mask, :]
+                    # positions were already set above
+                    try:
+                        # rolling-window periodicity metrics (may be slow; uses rolling_cfg["n_jobs"])
+                        df_roll = rolling_autocorr_metrics(
+                            X_group,
+                            positions,
+                            site_label=site_type,
+                            window_size=rolling_cfg["window_size"],
+                            step=rolling_cfg["step"],
+                            max_lag=rolling_cfg["max_lag"],
+                            min_molecules_per_window=rolling_cfg["min_molecules_per_window"],
+                            nrl_search_bp=rolling_cfg["nrl_search_bp"],
+                            pad_factor=rolling_cfg["pad_factor"],
+                            min_count_for_mean=rolling_cfg["min_count_for_mean"],
+                            max_harmonics=rolling_cfg["max_harmonics"],
+                            n_jobs=rolling_cfg["n_jobs"],
+                            verbose=False,
+                            fixed_nrl_bp=global_nrl
+                        )
+                    except Exception as e:
+                        warnings.warn(f"rolling_autocorr_metrics failed for {site_type} {sample_name} {ref_label}: {e}")
+                        continue
+
+                    # normalize column names and keep only the compact set:
+                    # center, n_molecules, nrl_bp, snr, xi, fwhm_bp
+                    if "center" not in df_roll.columns:
+                        # defensive: the rolling function returned an unexpected schema
+                        warnings.warn(f"rolling_autocorr_metrics returned unexpected schema for {site_type} {sample_name} {ref_label}")
+                        continue
+
+                    compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
+                    compact_df["site"] = site_type
+                    compact_df["sample"] = sample_name
+                    compact_df["reference"] = ref_label
+
+                    # filesystem-safe group names (needed by both the CSV and the plot below)
+                    safe_sample = str(sample_name).replace(os.sep, "_")
+                    safe_ref = str(ref_label).replace(os.sep, "_")
+
+                    # save per-group CSV
+                    if write_csvs:
+                        out_csv = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.csv")
+                        try:
+                            compact_df.to_csv(out_csv, index=False)
+                        except Exception as e:
+                            warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
+
+                    # save a per-group plot (NRL and SNR vs window center)
+                    if write_plots:
+                        try:
+                            # the plot helper may live in a different module; import accordingly
+                            from ..plotting import plot_rolling_metrics as _plot_roll
+                        except Exception:
+                            _plot_roll = globals().get("plot_rolling_metrics", None)
+                        if _plot_roll is not None:
+                            plot_png = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.png")
+                            try:
+                                _plot_roll(compact_df, out_png=plot_png,
+                                           title=f"{site_type} {sample_name} {ref_label}",
+                                           figsize=(10, 3.5), dpi=160, show=False)
+                            except Exception as e:
+                                warnings.warn(f"Failed to create rolling plot for {site_type} {sample_name} {ref_label}: {e}")
+
+                    # accumulate for the combined CSV and the in-memory dict
+                    combined_rows.append(compact_df.assign(site=site_type, sample=sample_name, reference=ref_label))
+                    rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
+
+            # persist per-site rolling metrics into adata.uns as a dict of DataFrames (possibly empty)
+            adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
+
+            # write the combined CSV for this site across all groups
+            if len(combined_rows):
+                combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
+                combined_out_csv = os.path.join(rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv")
+                try:
+                    combined_df_site.to_csv(combined_out_csv, index=False)
+                except Exception as e:
+                    warnings.warn(f"Failed to write combined rolling CSV for {site_type}: {e}")
+
+            rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
+            plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
+            os.makedirs(plot_out_dir, exist_ok=True)
+            pages = plot_rolling_grid(rolling_dict, plot_out_dir, site_type,
+                                      rows_per_page=cfg.rows_per_qc_autocorr_grid,
+                                      cols_per_page=len(refs),
+                                      dpi=160,
+                                      metrics=("nrl_bp", "snr", "xi"),
+                                      per_metric_ylim={"snr": (0, 25)})
+
+        from ..plotting import plot_spatial_autocorr_grid
+        make_dirs([pp_autocorr_dir])
+
+        plot_spatial_autocorr_grid(adata,
+                                   pp_autocorr_dir,
+                                   site_types=cfg.autocorr_site_types,
+                                   sample_col=cfg.sample_name_col_for_plotting,
+                                   window=cfg.autocorr_rolling_window_size,
+                                   rows_per_fig=cfg.rows_per_qc_autocorr_grid)
+
+    ############ Pearson analyses ###############
+    if smf_modality != 'direct':
+        from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
+
+        pp_corr_dir = pp_dir / "09_correlation_matrices"
+
+        if pp_corr_dir.is_dir():
+            print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
+        else:
+            compute_positionwise_statistics(
+                adata,
+                layer="nan0_0minus1",
+                methods=cfg.correlation_matrix_types,
+                sample_col=cfg.sample_name_col_for_plotting,
+                ref_col=cfg.reference_column,
+                output_key="positionwise_result",
+                site_types=cfg.correlation_matrix_site_types,
+                encoding="signed",
+                max_threads=cfg.threads,
+                min_count_for_pairwise=10,
+            )
+
+            plot_positionwise_matrices(
+                adata,
+                methods=cfg.correlation_matrix_types,
+                sample_col=cfg.sample_name_col_for_plotting,
+                ref_col=cfg.reference_column,
+                figsize_per_cell=(4.0, 3.0),
+                dpi=160,
+                cmaps=cfg.correlation_matrix_cmaps,
+                vmin=None,
+                vmax=None,
+                output_dir=pp_corr_dir,
+                output_key="positionwise_result"
+            )
+
+    ####### Save basic analysis adata - post preprocessing and duplicate removal ################
+    if not spatial_adata_path.exists() or cfg.force_redo_preprocessing:
+        print('Saving spatially analyzed adata post preprocessing and duplicate removal')
+        if spatial_adata_path.suffix != ".gz":
+            spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
+        print(f"Spatial adata path: {spatial_adata_path}")
+        safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
+    ############################################### smftools spatial end ###############################################
+
+    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
+
+    return adata, spatial_adata_path
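
For orientation, here is a hedged sketch of reading back what this stage stores. The obsm/uns key names come from the code above; the file path, the "GpC" site type, and the use of anndata.read_h5ad directly (rather than the package's safe_read_h5ad wrapper) are illustrative assumptions:

    import anndata as ad

    adata = ad.read_h5ad("spatial_adata.h5ad.gz")  # placeholder path

    site_type = "GpC"  # assumed member of cfg.autocorr_site_types
    ac = adata.obsm[f"{site_type}_spatial_autocorr"]        # molecules x lags autocorrelations
    lags = adata.uns[f"{site_type}_spatial_autocorr_lags"]  # lag axis in bp
    metrics = adata.uns[f"{site_type}_spatial_periodicity_metrics"]
    print(f"Global NRL estimate: {metrics['nrl_bp']} bp (SNR {metrics['snr']})")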