smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,277 +1,458 @@
1
- def spatial_adata(config_path):
2
- """
3
- High-level function to call for spatial analysis of an adata object.
4
- Command line accesses this through smftools spatial <config_path>
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
3
+
4
+ import anndata as ad
5
+
6
+ from smftools.logging_utils import get_logger
5
7
 
6
- Parameters:
7
- config_path (str): A string representing the file path to the experiment configuration csv file.
8
+ logger = get_logger(__name__)
8
9
 
9
- Returns:
10
- (pp_dedup_spatial_adata, pp_dedup_spatial_adata_path)
10
+
11
+ def spatial_adata(
12
+ config_path: str,
13
+ ) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
14
+ """
15
+ CLI-facing wrapper for spatial analyses.
16
+
17
+ Called by: `smftools spatial <config_path>`
18
+
19
+ Responsibilities:
20
+ - Ensure a usable AnnData exists via `load_adata` + `preprocess_adata`.
21
+ - Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
22
+ - Respect cfg.force_redo_spatial_analyses.
23
+ - Decide whether to skip (return existing) or run the spatial core.
24
+ - Call `spatial_adata_core(...)` when actual work is needed.
25
+
26
+ Returns
27
+ -------
28
+ spatial_adata : AnnData | None
29
+ AnnData with spatial analyses, or None if we skipped because a later-stage
30
+ AnnData already exists.
31
+ spatial_adata_path : Path | None
32
+ Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
11
33
  """
12
- from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
34
+ from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
35
+ from .helpers import get_adata_paths
13
36
  from .load_adata import load_adata
14
37
  from .preprocess_adata import preprocess_adata
15
38
 
16
- import numpy as np
17
- import pandas as pd
18
- import anndata as ad
19
- import scanpy as sc
20
-
39
+ # 1) Ensure config + basic paths via load_adata
40
+ loaded_adata, loaded_path, cfg = load_adata(config_path)
41
+ paths = get_adata_paths(cfg)
42
+
43
+ raw_path = paths.raw
44
+ pp_path = paths.pp
45
+ pp_dedup_path = paths.pp_dedup
46
+ spatial_path = paths.spatial
47
+ hmm_path = paths.hmm
48
+
49
+ # Stage-skipping logic for spatial
50
+ if not getattr(cfg, "force_redo_spatial_analyses", False):
51
+ # If HMM exists, it's the most processed stage — reuse it.
52
+ if hmm_path.exists():
53
+ logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
54
+ return None, hmm_path
55
+
56
+ # If spatial exists, we consider spatial analyses already done.
57
+ if spatial_path.exists():
58
+ logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
59
+ return None, spatial_path
60
+
61
+ # 2) Ensure preprocessing has been run
62
+ # This will create pp/pp_dedup as needed or return them if they already exist.
63
+ pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
64
+ config_path
65
+ )
66
+
67
+ # Helper to load from disk, reusing loaded_adata if it matches
68
+ def _load(path: Path):
69
+ if loaded_adata is not None and loaded_path == path:
70
+ return loaded_adata
71
+ adata, _ = safe_read_h5ad(path)
72
+ return adata
73
+
74
+ # 3) Decide which AnnData to use as the *starting point* for spatial analyses
75
+ # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
76
+ if pp_dedup_adata is not None:
77
+ start_adata = pp_dedup_adata
78
+ source_path = pp_dedup_adata_path_ret
79
+ else:
80
+ if pp_dedup_path.exists():
81
+ start_adata = _load(pp_dedup_path)
82
+ source_path = pp_dedup_path
83
+ elif pp_path.exists():
84
+ start_adata = _load(pp_path)
85
+ source_path = pp_path
86
+ elif raw_path.exists():
87
+ start_adata = _load(raw_path)
88
+ source_path = raw_path
89
+ else:
90
+ logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
91
+ return None, None
92
+
93
+ # 4) Run the spatial core
94
+ adata_spatial, spatial_path = spatial_adata_core(
95
+ adata=start_adata,
96
+ cfg=cfg,
97
+ spatial_adata_path=spatial_path,
98
+ pp_adata_path=pp_path,
99
+ pp_dup_rem_adata_path=pp_dedup_path,
100
+ pp_adata_in_memory=pp_adata,
101
+ source_adata_path=source_path,
102
+ config_path=config_path,
103
+ )
104
+
105
+ # 5) Register spatial path in summary CSV
106
+ add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
107
+
108
+ return adata_spatial, spatial_path
109
+
110
+
111
+ def spatial_adata_core(
112
+ adata: ad.AnnData,
113
+ cfg,
114
+ spatial_adata_path: Path,
115
+ pp_adata_path: Path,
116
+ pp_dup_rem_adata_path: Path,
117
+ pp_adata_in_memory: Optional[ad.AnnData] = None,
118
+ source_adata_path: Optional[Path] = None,
119
+ config_path: Optional[str] = None,
120
+ ) -> Tuple[ad.AnnData, Path]:
121
+ """
122
+ Core spatial analysis pipeline.
123
+
124
+ Assumes:
125
+ - `adata` is (typically) the preprocessed, duplicate-removed AnnData.
126
+ - `cfg` is the ExperimentConfig.
127
+ - `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
128
+ from `get_adata_paths`.
129
+ - `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
130
+ the same run of `preprocess_adata`, to avoid re-reading from disk.
131
+
132
+ Does:
133
+ - Optional sample sheet load.
134
+ - Optional inversion & reindexing.
135
+ - Clustermaps on:
136
+ * preprocessed (non-dedup) AnnData (for non-direct modalities), and
137
+ * deduplicated preprocessed AnnData.
138
+ - PCA/UMAP/Leiden.
139
+ - Autocorrelation + rolling metrics + grids.
140
+ - Positionwise correlation matrices (non-direct modalities).
141
+ - Save spatial AnnData to `spatial_adata_path`.
142
+
143
+ Returns
144
+ -------
145
+ adata : AnnData
146
+ Spatially analyzed AnnData (same object, modified in-place).
147
+ spatial_adata_path : Path
148
+ Path where spatial AnnData was written.
149
+ """
21
150
  import os
22
- from importlib import resources
151
+ import warnings
23
152
  from pathlib import Path
24
153
 
25
- from datetime import datetime
26
- date_str = datetime.today().strftime("%y%m%d")
154
+ import numpy as np
155
+ import pandas as pd
156
+ import scanpy as sc
27
157
 
28
- ############################################### smftools load start ###############################################
29
- adata, adata_path, cfg = load_adata(config_path)
30
- # General config variable init - Necessary user passed inputs
31
- smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
32
- output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
33
- # Make initial output directory
158
+ from ..metadata import record_smftools_metadata
159
+ from ..plotting import (
160
+ combined_raw_clustermap,
161
+ plot_rolling_grid,
162
+ plot_spatial_autocorr_grid,
163
+ )
164
+ from ..preprocessing import (
165
+ invert_adata,
166
+ load_sample_sheet,
167
+ reindex_references_adata,
168
+ )
169
+ from ..readwrite import make_dirs, safe_read_h5ad
170
+ from ..tools import calculate_umap
171
+ from ..tools.position_stats import (
172
+ compute_positionwise_statistics,
173
+ plot_positionwise_matrices,
174
+ )
175
+ from ..tools.spatial_autocorrelation import (
176
+ analyze_autocorr_matrix,
177
+ binary_autocorrelation_with_spacing,
178
+ bootstrap_periodicity,
179
+ rolling_autocorr_metrics,
180
+ )
181
+ from .helpers import write_gz_h5ad
182
+
183
+ # -----------------------------
184
+ # General setup
185
+ # -----------------------------
186
+ output_directory = Path(cfg.output_directory)
34
187
  make_dirs([output_directory])
35
- ############################################### smftools load end ###############################################
36
188
 
37
- ############################################### smftools preprocess start ###############################################
38
- pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
39
- ############################################### smftools preprocess end ###############################################
40
-
41
- ############################################### smftools spatial start ###############################################
42
- input_manager_df = pd.read_csv(cfg.summary_file)
43
- initial_adata_path = Path(input_manager_df['load_adata'][0])
44
- pp_adata_path = Path(input_manager_df['pp_adata'][0])
45
- pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
46
- spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
47
- hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
48
-
49
- if smf_modality == 'conversion':
189
+ smf_modality = cfg.smf_modality
190
+ if smf_modality == "conversion":
50
191
  deaminase = False
51
192
  else:
52
193
  deaminase = True
53
194
 
54
- if pp_adata and pp_dedup_adata:
55
- # This happens on first run of the preprocessing pipeline
56
- first_pp_run = True
57
- adata = pp_adata
58
- adata_unique = pp_dedup_adata
195
+ first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
196
+
197
+ # -----------------------------
198
+ # Optional sample sheet metadata
199
+ # -----------------------------
200
+ if getattr(cfg, "sample_sheet_path", None):
201
+ load_sample_sheet(
202
+ adata,
203
+ cfg.sample_sheet_path,
204
+ mapping_key_column=cfg.sample_sheet_mapping_column,
205
+ as_category=True,
206
+ force_reload=cfg.force_reload_sample_sheet,
207
+ )
208
+
209
+ # -----------------------------
210
+ # Optional inversion along positions axis
211
+ # -----------------------------
212
+ if getattr(cfg, "invert_adata", False):
213
+ adata = invert_adata(adata)
214
+
215
+ # -----------------------------
216
+ # Optional reindexing by reference
217
+ # -----------------------------
218
+ reindex_references_adata(
219
+ adata,
220
+ reference_col=cfg.reference_column,
221
+ offsets=cfg.reindexing_offsets,
222
+ new_col=cfg.reindexed_var_suffix,
223
+ )
224
+
225
+ if adata.uns.get("reindex_references_adata_performed", False):
226
+ reindex_suffix = cfg.reindexed_var_suffix
59
227
  else:
60
- # If an anndata is saved, check which stages of the anndata are available
61
- first_pp_run = False
62
- initial_version_available = initial_adata_path.exists()
63
- preprocessed_version_available = pp_adata_path.exists()
64
- preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
65
- preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
66
- hmm_version_available = hmm_adata_path.exists()
67
-
68
- if cfg.force_redo_basic_analyses:
69
- print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
70
- if preprocessed_dup_removed_version_available:
71
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
72
- adata_version = "pp_dedup"
73
- elif preprocessed_version_available:
74
- adata, load_report = safe_read_h5ad(pp_adata_path)
75
- adata_version = "pp"
76
- elif initial_version_available:
77
- adata, load_report = safe_read_h5ad(initial_adata_path)
78
- adata_version = "initial"
79
- else:
80
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
81
- return
82
- elif preprocessed_dedup_spatial_version_available:
83
- print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
84
- return None, spatial_adata_path
85
- elif preprocessed_dup_removed_version_available:
86
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
87
- adata_version = "pp_dedup"
88
- elif preprocessed_version_available:
89
- adata, load_report = safe_read_h5ad(pp_adata_path)
90
- adata_version = "pp"
91
- elif initial_version_available:
92
- adata, load_report = safe_read_h5ad(initial_adata_path)
93
- adata_version = "initial"
94
- else:
95
- print(f"No adata available.")
96
- return
97
-
228
+ reindex_suffix = None
229
+
98
230
  pp_dir = output_directory / "preprocessed"
99
231
  references = adata.obs[cfg.reference_column].cat.categories
100
232
 
101
- if smf_modality != 'direct':
102
- ######### Clustermaps #########
233
+ # ============================================================
234
+ # 1) Clustermaps (non-direct modalities) on *preprocessed* data
235
+ # ============================================================
236
+ if smf_modality != "direct":
237
+ preprocessed_version_available = pp_adata_path.exists()
238
+
103
239
  if preprocessed_version_available:
104
240
  pp_clustermap_dir = pp_dir / "06_clustermaps"
105
241
 
106
- if pp_clustermap_dir.is_dir():
107
- print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
242
+ if pp_clustermap_dir.is_dir() and not getattr(
243
+ cfg, "force_redo_spatial_analyses", False
244
+ ):
245
+ logger.debug(
246
+ f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
247
+ )
108
248
  else:
109
- from ..plotting import combined_raw_clustermap
110
249
  make_dirs([pp_dir, pp_clustermap_dir])
111
250
 
112
- if not first_pp_run:
113
- pp_adata, load_report = safe_read_h5ad(pp_adata_path)
251
+ if first_pp_run and (pp_adata_in_memory is not None):
252
+ pp_adata = pp_adata_in_memory
114
253
  else:
115
- pp_adata = adata
116
-
117
- clustermap_results = combined_raw_clustermap(pp_adata,
118
- sample_col=cfg.sample_name_col_for_plotting,
119
- reference_col=cfg.reference_column,
120
- mod_target_bases=cfg.mod_target_bases,
121
- layer_any_c=cfg.layer_for_clustermap_plotting,
122
- layer_gpc=cfg.layer_for_clustermap_plotting,
123
- layer_cpg=cfg.layer_for_clustermap_plotting,
124
- layer_a=cfg.layer_for_clustermap_plotting,
125
- cmap_any_c="coolwarm",
126
- cmap_gpc="coolwarm",
127
- cmap_cpg="viridis",
128
- cmap_a="coolwarm",
129
- min_quality=cfg.read_quality_filter_thresholds[0],
130
- min_length=cfg.read_len_filter_thresholds[0],
131
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
132
- min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
133
- bins=None,
134
- sample_mapping=None,
135
- save_path=pp_clustermap_dir,
136
- sort_by='gpc',
137
- deaminase=deaminase)
138
- if first_pp_run:
139
- adata = adata_unique
140
- else:
141
- pass
254
+ pp_adata, _ = safe_read_h5ad(pp_adata_path)
255
+
256
+ # -----------------------------
257
+ # Optional sample sheet metadata
258
+ # -----------------------------
259
+ if getattr(cfg, "sample_sheet_path", None):
260
+ load_sample_sheet(
261
+ pp_adata,
262
+ cfg.sample_sheet_path,
263
+ mapping_key_column=cfg.sample_sheet_mapping_column,
264
+ as_category=True,
265
+ force_reload=cfg.force_reload_sample_sheet,
266
+ )
142
267
 
143
- else:
144
- pass
145
-
146
- #### Proceed with dedeuplicated preprocessed anndata ###
147
- pp_dir = pp_dir / "deduplicated"
148
- pp_clustermap_dir = pp_dir / "06_clustermaps"
149
- pp_umap_dir = pp_dir / "07_umaps"
150
-
151
- if pp_clustermap_dir.is_dir():
152
- print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
268
+ # -----------------------------
269
+ # Optional inversion along positions axis
270
+ # -----------------------------
271
+ if getattr(cfg, "invert_adata", False):
272
+ pp_adata = invert_adata(pp_adata)
273
+
274
+ # -----------------------------
275
+ # Optional reindexing by reference
276
+ # -----------------------------
277
+ reindex_references_adata(
278
+ pp_adata,
279
+ reference_col=cfg.reference_column,
280
+ offsets=cfg.reindexing_offsets,
281
+ new_col=cfg.reindexed_var_suffix,
282
+ )
283
+
284
+ combined_raw_clustermap(
285
+ pp_adata,
286
+ sample_col=cfg.sample_name_col_for_plotting,
287
+ reference_col=cfg.reference_column,
288
+ mod_target_bases=cfg.mod_target_bases,
289
+ layer_c=cfg.layer_for_clustermap_plotting,
290
+ layer_gpc=cfg.layer_for_clustermap_plotting,
291
+ layer_cpg=cfg.layer_for_clustermap_plotting,
292
+ layer_a=cfg.layer_for_clustermap_plotting,
293
+ cmap_c=cfg.clustermap_cmap_c,
294
+ cmap_gpc=cfg.clustermap_cmap_gpc,
295
+ cmap_cpg=cfg.clustermap_cmap_cpg,
296
+ cmap_a=cfg.clustermap_cmap_a,
297
+ min_quality=cfg.read_quality_filter_thresholds[0],
298
+ min_length=cfg.read_len_filter_thresholds[0],
299
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
300
+ 0
301
+ ],
302
+ min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
303
+ demux_types=("double", "already"),
304
+ bins=None,
305
+ sample_mapping=None,
306
+ save_path=pp_clustermap_dir,
307
+ sort_by=cfg.spatial_clustermap_sortby,
308
+ deaminase=deaminase,
309
+ index_col_suffix=reindex_suffix,
310
+ )
311
+
312
+ # ============================================================
313
+ # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
314
+ # ============================================================
315
+ pp_dir_dedup = pp_dir / "deduplicated"
316
+ pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
317
+ pp_umap_dir = pp_dir_dedup / "07_umaps"
318
+
319
+ # Clustermaps on deduplicated adata
320
+ if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
321
+ logger.debug(
322
+ f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
323
+ )
153
324
  else:
154
- from ..plotting import combined_raw_clustermap
155
- make_dirs([pp_dir, pp_clustermap_dir])
156
- if smf_modality != 'direct':
157
- sort_by = 'gpc'
158
- else:
159
- sort_by = 'any_a'
160
- clustermap_results = combined_raw_clustermap(adata,
161
- sample_col=cfg.sample_name_col_for_plotting,
162
- reference_col=cfg.reference_column,
163
- mod_target_bases=cfg.mod_target_bases,
164
- layer_any_c=cfg.layer_for_clustermap_plotting,
165
- layer_gpc=cfg.layer_for_clustermap_plotting,
166
- layer_cpg=cfg.layer_for_clustermap_plotting,
167
- layer_a=cfg.layer_for_clustermap_plotting,
168
- cmap_any_c="coolwarm",
169
- cmap_gpc="coolwarm",
170
- cmap_cpg="viridis",
171
- cmap_a="coolwarm",
172
- min_quality=cfg.read_quality_filter_thresholds[0],
173
- min_length=cfg.read_len_filter_thresholds[0],
174
- min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
175
- min_position_valid_fraction=1-cfg.position_max_nan_threshold,
176
- bins=None,
177
- sample_mapping=None,
178
- save_path=pp_clustermap_dir,
179
- sort_by=sort_by,
180
- deaminase=deaminase)
181
-
182
- ######### PCA/UMAP/Leiden #########
183
- if pp_umap_dir.is_dir():
184
- print(f'{pp_umap_dir} already exists. Skipping UMAP plotting.')
325
+ make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
326
+ combined_raw_clustermap(
327
+ adata,
328
+ sample_col=cfg.sample_name_col_for_plotting,
329
+ reference_col=cfg.reference_column,
330
+ mod_target_bases=cfg.mod_target_bases,
331
+ layer_c=cfg.layer_for_clustermap_plotting,
332
+ layer_gpc=cfg.layer_for_clustermap_plotting,
333
+ layer_cpg=cfg.layer_for_clustermap_plotting,
334
+ layer_a=cfg.layer_for_clustermap_plotting,
335
+ cmap_c=cfg.clustermap_cmap_c,
336
+ cmap_gpc=cfg.clustermap_cmap_gpc,
337
+ cmap_cpg=cfg.clustermap_cmap_cpg,
338
+ cmap_a=cfg.clustermap_cmap_a,
339
+ min_quality=cfg.read_quality_filter_thresholds[0],
340
+ min_length=cfg.read_len_filter_thresholds[0],
341
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
342
+ 0
343
+ ],
344
+ min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
345
+ demux_types=("double", "already"),
346
+ bins=None,
347
+ sample_mapping=None,
348
+ save_path=pp_clustermap_dir_dedup,
349
+ sort_by=cfg.spatial_clustermap_sortby,
350
+ deaminase=deaminase,
351
+ index_col_suffix=reindex_suffix,
352
+ )
353
+
354
+ # UMAP / Leiden
355
+ if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
356
+ logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
185
357
  else:
186
- from ..tools import calculate_umap
187
358
  make_dirs([pp_umap_dir])
188
359
 
189
360
  var_filters = []
190
- if smf_modality == 'direct':
361
+ if smf_modality == "direct":
191
362
  for ref in references:
192
363
  for base in cfg.mod_target_bases:
193
- var_filters += [f'{ref}_{base}_site']
364
+ var_filters.append(f"{ref}_{base}_site")
194
365
  elif deaminase:
195
366
  for ref in references:
196
- var_filters += [f'{ref}_any_C_site']
367
+ var_filters.append(f"{ref}_C_site")
197
368
  else:
198
369
  for ref in references:
199
370
  for base in cfg.mod_target_bases:
200
- var_filters += [f'{ref}_{base}_site']
371
+ var_filters.append(f"{ref}_{base}_site")
201
372
 
202
- adata = calculate_umap(adata,
203
- layer=cfg.layer_for_umap_plotting,
204
- var_filters=var_filters,
205
- n_pcs=10,
206
- knn_neighbors=15)
373
+ adata = calculate_umap(
374
+ adata,
375
+ layer=cfg.layer_for_umap_plotting,
376
+ var_filters=var_filters,
377
+ n_pcs=10,
378
+ knn_neighbors=15,
379
+ )
207
380
 
208
- ## Clustering
209
381
  sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
210
382
 
211
- # Plotting UMAP
212
383
  sc.settings.figdir = pp_umap_dir
213
- umap_layers = ['leiden', cfg.sample_name_col_for_plotting, 'Reference_strand']
384
+ umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
214
385
  umap_layers += cfg.umap_layers_to_plot
215
386
  sc.pl.umap(adata, color=umap_layers, show=False, save=True)
216
387
 
217
- ########## Spatial autocorrelation analyses ###########
218
- from ..tools.spatial_autocorrelation import binary_autocorrelation_with_spacing, analyze_autocorr_matrix, bootstrap_periodicity, rolling_autocorr_metrics
219
- from ..plotting import plot_rolling_grid
220
- import warnings
221
-
222
- pp_autocorr_dir = pp_dir / "08_autocorrelations"
388
+ # ============================================================
389
+ # 3) Spatial autocorrelation + rolling metrics
390
+ # ============================================================
391
+ pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
223
392
 
224
- if pp_autocorr_dir.is_dir():
225
- print(f'{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.')
393
+ if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
394
+ logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
226
395
  else:
227
396
  positions = adata.var_names.astype(int).values
228
397
  lags = np.arange(cfg.autocorr_max_lag + 1)
229
398
 
230
- # optional: try to parallelize autocorr per-row with joblib
231
399
  try:
232
400
  from joblib import Parallel, delayed
401
+
233
402
  _have_joblib = True
234
403
  except Exception:
235
404
  _have_joblib = False
236
405
 
406
+ samples = (
407
+ adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
408
+ )
409
+ ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
410
+ refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
411
+
237
412
  for site_type in cfg.autocorr_site_types:
238
413
  layer_key = f"{site_type}_site_binary"
239
414
  if layer_key not in adata.layers:
240
- print(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
415
+ logger.debug(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
241
416
  continue
242
417
 
243
418
  X = adata.layers[layer_key]
244
419
  if getattr(X, "shape", (0,))[0] == 0:
245
- print(f"Layer {layer_key} empty — skipping {site_type}.")
420
+ logger.debug(f"Layer {layer_key} empty — skipping {site_type}.")
246
421
  continue
247
422
 
248
- # compute per-molecule autocorrs (and counts)
249
423
  rows = []
250
424
  counts = []
425
+
251
426
  if _have_joblib:
252
- # parallel map
427
+
253
428
  def _worker(row):
254
429
  try:
255
430
  ac, cnts = binary_autocorrelation_with_spacing(
256
- row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
431
+ row,
432
+ positions,
433
+ max_lag=cfg.autocorr_max_lag,
434
+ return_counts=True,
435
+ normalize=cfg.autocorr_normalization_method,
257
436
  )
258
- except Exception as e:
259
- # on error return NaN arrays
437
+ except Exception:
260
438
  ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
261
439
  cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
262
440
  return ac, cnts
263
441
 
264
- res = Parallel(n_jobs=cfg.n_jobs if hasattr(cfg, "n_jobs") else -1)(
442
+ res = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
265
443
  delayed(_worker)(X[i]) for i in range(X.shape[0])
266
444
  )
267
445
  for ac, cnts in res:
268
446
  rows.append(ac)
269
447
  counts.append(cnts)
270
448
  else:
271
- # sequential fallback
272
449
  for i in range(X.shape[0]):
273
450
  ac, cnts = binary_autocorrelation_with_spacing(
274
- X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
451
+ X[i],
452
+ positions,
453
+ max_lag=cfg.autocorr_max_lag,
454
+ return_counts=True,
455
+ normalize=cfg.autocorr_normalization_method,
275
456
  )
276
457
  rows.append(ac)
277
458
  counts.append(cnts)
@@ -279,21 +460,23 @@ def spatial_adata(config_path):
279
460
  autocorr_matrix = np.asarray(rows, dtype=np.float32)
280
461
  counts_matrix = np.asarray(counts, dtype=np.int32)
281
462
 
282
- # store raw per-molecule arrays (keep memory format compact)
283
463
  adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
284
464
  adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
285
465
  adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
286
466
 
287
- # compute global periodicity metrics across all molecules for this site_type
288
467
  try:
289
468
  results = analyze_autocorr_matrix(
290
- autocorr_matrix, counts_matrix, lags,
291
- nrl_search_bp=(120, 260), pad_factor=4, min_count=20, max_harmonics=6
469
+ autocorr_matrix,
470
+ counts_matrix,
471
+ lags,
472
+ nrl_search_bp=(120, 260),
473
+ pad_factor=4,
474
+ min_count=20,
475
+ max_harmonics=6,
292
476
  )
293
477
  except Exception as e:
294
478
  results = {"error": str(e)}
295
479
 
296
- # store global metrics (same keys you used)
297
480
  global_metrics = {
298
481
  "nrl_bp": results.get("nrl_bp", np.nan),
299
482
  "xi": results.get("xi", np.nan),
@@ -305,13 +488,16 @@ def spatial_adata(config_path):
305
488
  }
306
489
  adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
307
490
 
308
- # bootstrap for CI (use a reasonable default; set low only for debugging)
309
491
  n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
310
- # if user intentionally set very low n_boot in cfg, we keep that; otherwise default 200
311
492
  try:
312
493
  bs = bootstrap_periodicity(
313
- autocorr_matrix, counts_matrix, lags,
314
- n_boot=n_boot, nrl_search_bp=(120, 260), pad_factor=4, min_count=20
494
+ autocorr_matrix,
495
+ counts_matrix,
496
+ lags,
497
+ n_boot=n_boot,
498
+ nrl_search_bp=(120, 260),
499
+ pad_factor=4,
500
+ min_count=20,
315
501
  )
316
502
  adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
317
503
  "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
@@ -320,57 +506,74 @@ def spatial_adata(config_path):
320
506
  except Exception as e:
321
507
  adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
322
508
 
323
- # ----------------------------
324
- # Compute group-level metrics for plotting (per sample × reference)
325
- # ----------------------------
326
509
  metrics_by_group = {}
327
510
  sample_col = cfg.sample_name_col_for_plotting
328
- ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
329
- samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
330
- refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
331
511
 
332
- # iterate groups and run analyzer on each group's subset; cache errors
333
512
  for sample_name in samples:
334
- sample_mask = (adata.obs[sample_col].values == sample_name)
513
+ sample_mask = adata.obs[sample_col].values == sample_name
514
+
335
515
  # combined group
336
516
  mask = sample_mask
337
517
  ac_sel = autocorr_matrix[mask, :]
338
518
  cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
339
519
  if ac_sel.size:
340
520
  try:
341
- r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
342
- lags, nrl_search_bp=(120,260), pad_factor=4, min_count=10, max_harmonics=6)
521
+ r = analyze_autocorr_matrix(
522
+ ac_sel,
523
+ cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
524
+ lags,
525
+ nrl_search_bp=(120, 260),
526
+ pad_factor=4,
527
+ min_count=10,
528
+ max_harmonics=6,
529
+ )
343
530
  except Exception as e:
344
531
  r = {"error": str(e)}
345
532
  else:
346
533
  r = {"error": "no_data"}
347
534
  metrics_by_group[(sample_name, None)] = r
348
535
 
349
- # per-reference groups
350
536
  for ref in refs:
351
537
  mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
352
538
  ac_sel = autocorr_matrix[mask_ref, :]
353
539
  cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
354
540
  if ac_sel.size:
355
541
  try:
356
- r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
357
- lags, nrl_search_bp=(120,260), pad_factor=4, min_count=10, max_harmonics=6)
542
+ r = analyze_autocorr_matrix(
543
+ ac_sel,
544
+ cnt_sel
545
+ if cnt_sel is not None
546
+ else np.zeros_like(ac_sel, dtype=int),
547
+ lags,
548
+ nrl_search_bp=(120, 260),
549
+ pad_factor=4,
550
+ min_count=10,
551
+ max_harmonics=6,
552
+ )
358
553
  except Exception as e:
359
554
  r = {"error": str(e)}
360
555
  else:
361
556
  r = {"error": "no_data"}
362
557
  metrics_by_group[(sample_name, ref)] = r
363
558
 
364
- # persist group metrics
365
559
  adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
366
560
 
367
- global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
561
+ global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
562
+ "nrl_bp", None
563
+ )
368
564
 
369
- # configuration / sensible defaults (override in cfg if present)
370
565
  rolling_cfg = {
371
- "window_size": getattr(cfg, "rolling_window_size", getattr(cfg, "autocorr_rolling_window_size", 600)),
566
+ "window_size": getattr(
567
+ cfg,
568
+ "rolling_window_size",
569
+ getattr(cfg, "autocorr_rolling_window_size", 600),
570
+ ),
372
571
  "step": getattr(cfg, "rolling_step", 100),
373
- "max_lag": getattr(cfg, "rolling_max_lag", cfg.autocorr_max_lag if hasattr(cfg, "autocorr_max_lag") else 500),
572
+ "max_lag": getattr(
573
+ cfg,
574
+ "rolling_max_lag",
575
+ getattr(cfg, "autocorr_max_lag", 500),
576
+ ),
374
577
  "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
375
578
  "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
376
579
  "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
@@ -381,23 +584,19 @@ def spatial_adata(config_path):
381
584
 
382
585
  write_plots = getattr(cfg, "rolling_write_plots", True)
383
586
  write_csvs = getattr(cfg, "rolling_write_csvs", True)
384
- min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30) # only run rolling if group has >= this many molecules
587
+ min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
385
588
 
386
589
  rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
387
590
  os.makedirs(rolling_out_dir, exist_ok=True)
388
- # also a per-site subfolder
389
591
  site_out_dir = os.path.join(rolling_out_dir, site_type)
390
592
  os.makedirs(site_out_dir, exist_ok=True)
391
593
 
392
- combined_rows = [] # accumulate one row per window for combined CSV
393
- rolling_results_by_group = {} # store DataFrame per group in memory (persist later to adata.uns)
594
+ combined_rows = []
595
+ rolling_results_by_group = {}
394
596
 
395
- # iterate groups (samples × refs). `samples` and `refs` were computed above.
396
597
  for sample_name in samples:
397
- sample_mask = (adata.obs[sample_col].values == sample_name)
398
- # first the combined group ("all refs")
598
+ sample_mask = adata.obs[sample_col].values == sample_name
399
599
  group_masks = [("all", sample_mask)]
400
- # then per-reference groups
401
600
  for ref in refs:
402
601
  ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
403
602
  group_masks.append((ref, ref_mask))
@@ -405,17 +604,10 @@ def spatial_adata(config_path):
405
604
  for ref_label, mask in group_masks:
406
605
  n_group = int(mask.sum())
407
606
  if n_group < min_molecules_for_group:
408
- # skip tiny groups
409
- if cfg.get("verbosity", 0) if hasattr(cfg, "get") else False:
410
- print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
411
- # still write an empty CSV row set if desired; here we skip
412
607
  continue
413
608
 
414
- # extract group matrix X_group (works with dense or sparse adata.layers)
415
609
  X_group = X[mask, :]
416
- # positions already set above
417
610
  try:
418
- # call your rolling function (this may be slow; it uses cfg.n_jobs)
419
611
  df_roll = rolling_autocorr_metrics(
420
612
  X_group,
421
613
  positions,
@@ -430,135 +622,160 @@ def spatial_adata(config_path):
430
622
  max_harmonics=rolling_cfg["max_harmonics"],
431
623
  n_jobs=rolling_cfg["n_jobs"],
432
624
  verbose=False,
433
- fixed_nrl_bp=global_nrl
625
+ fixed_nrl_bp=global_nrl,
434
626
  )
435
627
  except Exception as e:
436
- warnings.warn(f"rolling_autocorr_metrics failed for {site_type} {sample_name} {ref_label}: {e}")
628
+ logger.warning(
629
+ f"rolling_autocorr_metrics failed for {site_type} "
630
+ f"{sample_name} {ref_label}: {e}"
631
+ )
437
632
  continue
438
633
 
439
- # normalize column names and keep only the compact set you want
440
- # keep: center, n_molecules, nrl_bp, snr, xi, fwhm_bp
441
634
  if "center" not in df_roll.columns:
442
- # defensive: if the rolling function returned different schema, skip
443
- warnings.warn(f"rolling_autocorr_metrics returned unexpected schema for {site_type} {sample_name} {ref_label}")
635
+ logger.warning(
636
+ f"rolling_autocorr_metrics returned unexpected schema "
637
+ f"for {site_type} {sample_name} {ref_label}"
638
+ )
444
639
  continue
445
640
 
446
- compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
641
+ compact_df = df_roll[
642
+ ["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]
643
+ ].copy()
447
644
  compact_df["site"] = site_type
448
645
  compact_df["sample"] = sample_name
449
646
  compact_df["reference"] = ref_label if ref_label != "all" else "all"
450
647
 
451
- # save per-group CSV
452
648
  if write_csvs:
453
649
  safe_sample = str(sample_name).replace(os.sep, "_")
454
- safe_ref = str(ref_label if ref_label != "all" else "all").replace(os.sep, "_")
455
- out_csv = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.csv")
650
+ safe_ref = str(ref_label if ref_label != "all" else "all").replace(
651
+ os.sep, "_"
652
+ )
653
+ out_csv = os.path.join(
654
+ site_out_dir,
655
+ f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
656
+ )
456
657
  try:
457
658
  compact_df.to_csv(out_csv, index=False)
458
659
  except Exception as e:
459
- warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
660
+ logger.warning(f"Failed to write rolling CSV {out_csv}: {e}")
460
661
 
461
- # save a plot per-group (NRL and SNR vs center)
462
662
  if write_plots:
463
663
  try:
464
- # use your plot helper; if it's in a different module, import accordingly
465
664
  from ..plotting import plot_rolling_metrics as _plot_roll
466
665
  except Exception:
467
- _plot_roll = globals().get("plot_rolling_metrics", None)
666
+ _plot_roll = None
468
667
  if _plot_roll is not None:
469
- plot_png = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.png")
668
+ plot_png = os.path.join(
669
+ site_out_dir,
670
+ f"{safe_sample}__{safe_ref}__rolling_metrics.png",
671
+ )
470
672
  try:
471
- _plot_roll(compact_df, out_png=plot_png,
472
- title=f"{site_type} {sample_name} {ref_label}",
473
- figsize=(10,3.5), dpi=160, show=False)
673
+ _plot_roll(
674
+ compact_df,
675
+ out_png=plot_png,
676
+ title=f"{site_type} {sample_name} {ref_label}",
677
+ figsize=(10, 3.5),
678
+ dpi=160,
679
+ show=False,
680
+ )
474
681
  except Exception as e:
475
- warnings.warn(f"Failed to create rolling plot for {site_type} {sample_name} {ref_label}: {e}")
682
+ logger.warning(
683
+ f"Failed to create rolling plot for {site_type} "
684
+ f"{sample_name} {ref_label}: {e}"
685
+ )
476
686
 
477
- # store in combined_rows and in-memory dict
478
- combined_rows.append(compact_df.assign(site=site_type, sample=sample_name, reference=ref_label))
479
- rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
687
+ combined_rows.append(
688
+ compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
689
+ )
690
+ rolling_results_by_group[
691
+ (sample_name, None if ref_label == "all" else ref_label)
692
+ ] = compact_df
480
693
 
481
- # persist per-site rolling metrics into adata.uns as dict of DataFrames (or empty dict)
482
694
  adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
483
695
 
484
- # write combined CSV for this site across all groups
485
- if len(combined_rows):
696
+ if combined_rows:
486
697
  combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
487
- combined_out_csv = os.path.join(rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv")
698
+ combined_out_csv = os.path.join(
699
+ rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv"
700
+ )
488
701
  try:
489
702
  combined_df_site.to_csv(combined_out_csv, index=False)
490
703
  except Exception as e:
491
- warnings.warn(f"Failed to write combined rolling CSV for {site_type}: {e}")
704
+ logger.warning(f"Failed to write combined rolling CSV for {site_type}: {e}")
492
705
 
493
706
  rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
494
707
  plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
495
708
  os.makedirs(plot_out_dir, exist_ok=True)
496
- pages = plot_rolling_grid(rolling_dict, plot_out_dir, site_type,
497
- rows_per_page=cfg.rows_per_qc_autocorr_grid,
498
- cols_per_page=len(refs),
499
- dpi=160,
500
- metrics=("nrl_bp","snr", "xi"),
501
- per_metric_ylim={"snr": (0, 25)})
502
-
503
- from ..plotting import plot_spatial_autocorr_grid
504
- make_dirs([pp_autocorr_dir, pp_autocorr_dir])
505
-
506
- plot_spatial_autocorr_grid(adata,
507
- pp_autocorr_dir,
508
- site_types=cfg.autocorr_site_types,
509
- sample_col=cfg.sample_name_col_for_plotting,
510
- window=cfg.autocorr_rolling_window_size,
511
- rows_per_fig=cfg.rows_per_qc_autocorr_grid)
512
-
513
- ############ Pearson analyses ###############
514
- if smf_modality != 'direct':
515
- from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
516
-
517
- pp_corr_dir = pp_dir / "09_correlation_matrices"
518
-
519
- if pp_corr_dir.is_dir():
520
- print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
521
- else:
522
- compute_positionwise_statistics(
523
- adata,
524
- layer="nan0_0minus1",
525
- methods=cfg.correlation_matrix_types,
526
- sample_col=cfg.sample_name_col_for_plotting,
527
- ref_col=cfg.reference_column,
528
- output_key="positionwise_result",
529
- site_types=cfg.correlation_matrix_site_types,
530
- encoding="signed",
531
- max_threads=cfg.threads,
532
- min_count_for_pairwise=10,
709
+ _ = plot_rolling_grid(
710
+ rolling_dict,
711
+ plot_out_dir,
712
+ site_type,
713
+ rows_per_page=cfg.rows_per_qc_autocorr_grid,
714
+ cols_per_page=len(refs),
715
+ dpi=160,
716
+ metrics=("nrl_bp", "snr", "xi"),
717
+ per_metric_ylim={"snr": (0, 25)},
533
718
  )
534
-
535
- plot_positionwise_matrices(
719
+
720
+ make_dirs([pp_autocorr_dir])
721
+ plot_spatial_autocorr_grid(
536
722
  adata,
537
- methods=cfg.correlation_matrix_types,
723
+ pp_autocorr_dir,
724
+ site_types=cfg.autocorr_site_types,
538
725
  sample_col=cfg.sample_name_col_for_plotting,
539
- ref_col=cfg.reference_column,
540
- figsize_per_cell=(4.0, 3.0),
541
- dpi=160,
542
- cmaps=cfg.correlation_matrix_cmaps,
543
- vmin=None,
544
- vmax=None,
545
- output_dir=pp_corr_dir,
546
- output_key= "positionwise_result"
726
+ window=cfg.autocorr_rolling_window_size,
727
+ rows_per_fig=cfg.rows_per_qc_autocorr_grid,
728
+ normalization_method=cfg.autocorr_normalization_method,
547
729
  )
548
730
 
549
- ####### Save basic analysis adata - post preprocessing and duplicate removal ################
550
- from ..readwrite import safe_write_h5ad
551
- if not spatial_adata_path.exists() or cfg.force_redo_preprocessing:
552
- print('Saving spatial analyzed adata post preprocessing and duplicate removal')
553
- if ".gz" == spatial_adata_path.suffix:
554
- print(f"Spatial adata path: {spatial_adata_path}")
555
- safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
556
- else:
557
- spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
558
- print(f"Spatial adata path: {spatial_adata_path}")
559
- safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
560
- ############################################### smftools spatial end ###############################################
561
-
562
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
731
+ # ============================================================
732
+ # 4) Pearson / correlation matrices
733
+ # ============================================================
734
+ pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
563
735
 
564
- return adata, spatial_adata_path
736
+ if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
737
+ logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
738
+ else:
739
+ compute_positionwise_statistics(
740
+ adata,
741
+ layer="nan0_0minus1",
742
+ methods=cfg.correlation_matrix_types,
743
+ sample_col=cfg.sample_name_col_for_plotting,
744
+ ref_col=cfg.reference_column,
745
+ output_key="positionwise_result",
746
+ site_types=cfg.correlation_matrix_site_types,
747
+ encoding="signed",
748
+ max_threads=cfg.threads,
749
+ min_count_for_pairwise=10,
750
+ )
751
+
752
+ plot_positionwise_matrices(
753
+ adata,
754
+ methods=cfg.correlation_matrix_types,
755
+ sample_col=cfg.sample_name_col_for_plotting,
756
+ ref_col=cfg.reference_column,
757
+ figsize_per_cell=(4.0, 3.0),
758
+ dpi=160,
759
+ cmaps=cfg.correlation_matrix_cmaps,
760
+ vmin=None,
761
+ vmax=None,
762
+ output_dir=pp_corr_dir,
763
+ output_key="positionwise_result",
764
+ )
765
+
766
+ # ============================================================
767
+ # 5) Save spatial AnnData
768
+ # ============================================================
769
+ if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
770
+ logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
771
+ record_smftools_metadata(
772
+ adata,
773
+ step_name="spatial",
774
+ cfg=cfg,
775
+ config_path=config_path,
776
+ input_paths=[source_adata_path] if source_adata_path else None,
777
+ output_path=spatial_adata_path,
778
+ )
779
+ write_gz_h5ad(adata, spatial_adata_path)
780
+
781
+ return adata, spatial_adata_path