smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,239 +1,356 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
High-level function to call for spatial analysis of an adata object.
|
|
4
|
-
Command line accesses this through smftools spatial <config_path>
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional, Tuple
|
|
5
3
|
|
|
6
|
-
|
|
7
|
-
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
4
|
+
import anndata as ad
|
|
8
5
|
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
def spatial_adata(
|
|
7
|
+
config_path: str,
|
|
8
|
+
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
|
|
9
|
+
"""
|
|
10
|
+
CLI-facing wrapper for spatial analyses.
|
|
11
|
+
|
|
12
|
+
Called by: `smftools spatial <config_path>`
|
|
13
|
+
|
|
14
|
+
Responsibilities:
|
|
15
|
+
- Ensure a usable AnnData exists via `load_adata` + `preprocess_adata`.
|
|
16
|
+
- Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
|
|
17
|
+
- Respect cfg.force_redo_spatial_analyses.
|
|
18
|
+
- Decide whether to skip (return existing) or run the spatial core.
|
|
19
|
+
- Call `spatial_adata_core(...)` when actual work is needed.
|
|
20
|
+
|
|
21
|
+
Returns
|
|
22
|
+
-------
|
|
23
|
+
spatial_adata : AnnData | None
|
|
24
|
+
AnnData with spatial analyses, or None if we skipped because a later-stage
|
|
25
|
+
AnnData already exists.
|
|
26
|
+
spatial_adata_path : Path | None
|
|
27
|
+
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
11
28
|
"""
|
|
12
|
-
from ..readwrite import safe_read_h5ad,
|
|
29
|
+
from ..readwrite import safe_read_h5ad, make_dirs, add_or_update_column_in_csv
|
|
13
30
|
from .load_adata import load_adata
|
|
14
31
|
from .preprocess_adata import preprocess_adata
|
|
32
|
+
from .helpers import get_adata_paths
|
|
33
|
+
|
|
34
|
+
# 1) Ensure config + basic paths via load_adata
|
|
35
|
+
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
36
|
+
paths = get_adata_paths(cfg)
|
|
37
|
+
|
|
38
|
+
raw_path = paths.raw
|
|
39
|
+
pp_path = paths.pp
|
|
40
|
+
pp_dedup_path = paths.pp_dedup
|
|
41
|
+
spatial_path = paths.spatial
|
|
42
|
+
hmm_path = paths.hmm
|
|
43
|
+
|
|
44
|
+
# Stage-skipping logic for spatial
|
|
45
|
+
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
46
|
+
# If HMM exists, it's the most processed stage — reuse it.
|
|
47
|
+
if hmm_path.exists():
|
|
48
|
+
print(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
49
|
+
return None, hmm_path
|
|
50
|
+
|
|
51
|
+
# If spatial exists, we consider spatial analyses already done.
|
|
52
|
+
if spatial_path.exists():
|
|
53
|
+
print(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
54
|
+
return None, spatial_path
|
|
55
|
+
|
|
56
|
+
# 2) Ensure preprocessing has been run
|
|
57
|
+
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
58
|
+
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(config_path)
|
|
59
|
+
|
|
60
|
+
# Helper to load from disk, reusing loaded_adata if it matches
|
|
61
|
+
def _load(path: Path):
|
|
62
|
+
from ..readwrite import safe_read_h5ad
|
|
63
|
+
if loaded_adata is not None and loaded_path == path:
|
|
64
|
+
return loaded_adata
|
|
65
|
+
adata, _ = safe_read_h5ad(path)
|
|
66
|
+
return adata
|
|
67
|
+
|
|
68
|
+
# 3) Decide which AnnData to use as the *starting point* for spatial analyses
|
|
69
|
+
# Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
|
|
70
|
+
if pp_dedup_adata is not None:
|
|
71
|
+
start_adata = pp_dedup_adata
|
|
72
|
+
else:
|
|
73
|
+
if pp_dedup_path.exists():
|
|
74
|
+
start_adata = _load(pp_dedup_path)
|
|
75
|
+
elif pp_path.exists():
|
|
76
|
+
start_adata = _load(pp_path)
|
|
77
|
+
elif raw_path.exists():
|
|
78
|
+
start_adata = _load(raw_path)
|
|
79
|
+
else:
|
|
80
|
+
print("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
81
|
+
return None, None
|
|
82
|
+
|
|
83
|
+
# 4) Run the spatial core
|
|
84
|
+
adata_spatial, spatial_path = spatial_adata_core(
|
|
85
|
+
adata=start_adata,
|
|
86
|
+
cfg=cfg,
|
|
87
|
+
spatial_adata_path=spatial_path,
|
|
88
|
+
pp_adata_path=pp_path,
|
|
89
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
90
|
+
pp_adata_in_memory=pp_adata,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# 5) Register spatial path in summary CSV
|
|
94
|
+
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
|
|
95
|
+
|
|
96
|
+
return adata_spatial, spatial_path
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def spatial_adata_core(
|
|
100
|
+
adata: ad.AnnData,
|
|
101
|
+
cfg,
|
|
102
|
+
spatial_adata_path: Path,
|
|
103
|
+
pp_adata_path: Path,
|
|
104
|
+
pp_dup_rem_adata_path: Path,
|
|
105
|
+
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
106
|
+
) -> Tuple[ad.AnnData, Path]:
|
|
107
|
+
"""
|
|
108
|
+
Core spatial analysis pipeline.
|
|
109
|
+
|
|
110
|
+
Assumes:
|
|
111
|
+
- `adata` is (typically) the preprocessed, duplicate-removed AnnData.
|
|
112
|
+
- `cfg` is the ExperimentConfig.
|
|
113
|
+
- `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
|
|
114
|
+
from `get_adata_paths`.
|
|
115
|
+
- `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
|
|
116
|
+
the same run of `preprocess_adata`, to avoid re-reading from disk.
|
|
117
|
+
|
|
118
|
+
Does:
|
|
119
|
+
- Optional sample sheet load.
|
|
120
|
+
- Optional inversion & reindexing.
|
|
121
|
+
- Clustermaps on:
|
|
122
|
+
* preprocessed (non-dedup) AnnData (for non-direct modalities), and
|
|
123
|
+
* deduplicated preprocessed AnnData.
|
|
124
|
+
- PCA/UMAP/Leiden.
|
|
125
|
+
- Autocorrelation + rolling metrics + grids.
|
|
126
|
+
- Positionwise correlation matrices (non-direct modalities).
|
|
127
|
+
- Save spatial AnnData to `spatial_adata_path`.
|
|
128
|
+
|
|
129
|
+
Returns
|
|
130
|
+
-------
|
|
131
|
+
adata : AnnData
|
|
132
|
+
Spatially analyzed AnnData (same object, modified in-place).
|
|
133
|
+
spatial_adata_path : Path
|
|
134
|
+
Path where spatial AnnData was written.
|
|
135
|
+
"""
|
|
136
|
+
import os
|
|
137
|
+
import warnings
|
|
138
|
+
from pathlib import Path
|
|
15
139
|
|
|
16
140
|
import numpy as np
|
|
17
141
|
import pandas as pd
|
|
18
|
-
import anndata as ad
|
|
19
142
|
import scanpy as sc
|
|
20
143
|
|
|
21
|
-
import
|
|
22
|
-
from
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
144
|
+
from ..readwrite import make_dirs, safe_read_h5ad
|
|
145
|
+
from .helpers import write_gz_h5ad
|
|
146
|
+
|
|
147
|
+
from ..preprocessing import (
|
|
148
|
+
load_sample_sheet,
|
|
149
|
+
invert_adata,
|
|
150
|
+
reindex_references_adata,
|
|
151
|
+
)
|
|
152
|
+
from ..plotting import (
|
|
153
|
+
combined_raw_clustermap,
|
|
154
|
+
plot_rolling_grid,
|
|
155
|
+
plot_spatial_autocorr_grid,
|
|
156
|
+
)
|
|
157
|
+
from ..tools import calculate_umap
|
|
158
|
+
from ..tools.spatial_autocorrelation import (
|
|
159
|
+
binary_autocorrelation_with_spacing,
|
|
160
|
+
analyze_autocorr_matrix,
|
|
161
|
+
bootstrap_periodicity,
|
|
162
|
+
rolling_autocorr_metrics,
|
|
163
|
+
)
|
|
164
|
+
from ..tools.position_stats import (
|
|
165
|
+
compute_positionwise_statistics,
|
|
166
|
+
plot_positionwise_matrices,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# -----------------------------
|
|
170
|
+
# General setup
|
|
171
|
+
# -----------------------------
|
|
172
|
+
output_directory = Path(cfg.output_directory)
|
|
34
173
|
make_dirs([output_directory])
|
|
35
|
-
############################################### smftools load end ###############################################
|
|
36
|
-
|
|
37
|
-
############################################### smftools preprocess start ###############################################
|
|
38
|
-
pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
|
|
39
|
-
############################################### smftools preprocess end ###############################################
|
|
40
174
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
initial_adata_path = Path(input_manager_df['load_adata'][0])
|
|
44
|
-
pp_adata_path = Path(input_manager_df['pp_adata'][0])
|
|
45
|
-
pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
|
|
46
|
-
spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
|
|
47
|
-
hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
|
|
48
|
-
|
|
49
|
-
if smf_modality == 'conversion':
|
|
175
|
+
smf_modality = cfg.smf_modality
|
|
176
|
+
if smf_modality == "conversion":
|
|
50
177
|
deaminase = False
|
|
51
178
|
else:
|
|
52
179
|
deaminase = True
|
|
53
180
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
|
|
84
|
-
return None, spatial_adata_path
|
|
85
|
-
elif preprocessed_dup_removed_version_available:
|
|
86
|
-
adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
|
|
87
|
-
adata_version = "pp_dedup"
|
|
88
|
-
elif preprocessed_version_available:
|
|
89
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
90
|
-
adata_version = "pp"
|
|
91
|
-
elif initial_version_available:
|
|
92
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
93
|
-
adata_version = "initial"
|
|
94
|
-
else:
|
|
95
|
-
print(f"No adata available.")
|
|
96
|
-
return
|
|
181
|
+
first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
|
|
182
|
+
|
|
183
|
+
# -----------------------------
|
|
184
|
+
# Optional sample sheet metadata
|
|
185
|
+
# -----------------------------
|
|
186
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
187
|
+
load_sample_sheet(
|
|
188
|
+
adata,
|
|
189
|
+
cfg.sample_sheet_path,
|
|
190
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
191
|
+
as_category=True,
|
|
192
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# -----------------------------
|
|
196
|
+
# Optional inversion along positions axis
|
|
197
|
+
# -----------------------------
|
|
198
|
+
if getattr(cfg, "invert_adata", False):
|
|
199
|
+
adata = invert_adata(adata)
|
|
200
|
+
|
|
201
|
+
# -----------------------------
|
|
202
|
+
# Optional reindexing by reference
|
|
203
|
+
# -----------------------------
|
|
204
|
+
reindex_references_adata(
|
|
205
|
+
adata,
|
|
206
|
+
reference_col=cfg.reference_column,
|
|
207
|
+
offsets=cfg.reindexing_offsets,
|
|
208
|
+
new_col=cfg.reindexed_var_suffix,
|
|
209
|
+
)
|
|
97
210
|
|
|
98
211
|
pp_dir = output_directory / "preprocessed"
|
|
99
212
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
100
213
|
|
|
101
|
-
|
|
102
|
-
|
|
214
|
+
# ============================================================
|
|
215
|
+
# 1) Clustermaps (non-direct modalities) on *preprocessed* data
|
|
216
|
+
# ============================================================
|
|
217
|
+
if smf_modality != "direct":
|
|
218
|
+
preprocessed_version_available = pp_adata_path.exists()
|
|
219
|
+
|
|
103
220
|
if preprocessed_version_available:
|
|
104
221
|
pp_clustermap_dir = pp_dir / "06_clustermaps"
|
|
105
222
|
|
|
106
|
-
if pp_clustermap_dir.is_dir()
|
|
107
|
-
|
|
223
|
+
if pp_clustermap_dir.is_dir() and not getattr(
|
|
224
|
+
cfg, "force_redo_spatial_analyses", False
|
|
225
|
+
):
|
|
226
|
+
print(f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData.")
|
|
108
227
|
else:
|
|
109
|
-
from ..plotting import combined_raw_clustermap
|
|
110
228
|
make_dirs([pp_dir, pp_clustermap_dir])
|
|
111
229
|
|
|
112
|
-
if not
|
|
113
|
-
pp_adata
|
|
230
|
+
if first_pp_run and (pp_adata_in_memory is not None):
|
|
231
|
+
pp_adata = pp_adata_in_memory
|
|
114
232
|
else:
|
|
115
|
-
pp_adata =
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
pass
|
|
142
|
-
|
|
143
|
-
else:
|
|
144
|
-
pass
|
|
233
|
+
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
234
|
+
|
|
235
|
+
combined_raw_clustermap(
|
|
236
|
+
pp_adata,
|
|
237
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
238
|
+
reference_col=cfg.reference_column,
|
|
239
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
240
|
+
layer_c=cfg.layer_for_clustermap_plotting,
|
|
241
|
+
layer_gpc=cfg.layer_for_clustermap_plotting,
|
|
242
|
+
layer_cpg=cfg.layer_for_clustermap_plotting,
|
|
243
|
+
layer_a=cfg.layer_for_clustermap_plotting,
|
|
244
|
+
cmap_c=cfg.clustermap_cmap_c,
|
|
245
|
+
cmap_gpc=cfg.clustermap_cmap_gpc,
|
|
246
|
+
cmap_cpg=cfg.clustermap_cmap_cpg,
|
|
247
|
+
cmap_a=cfg.clustermap_cmap_a,
|
|
248
|
+
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
249
|
+
min_length=cfg.read_len_filter_thresholds[0],
|
|
250
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
|
|
251
|
+
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
252
|
+
bins=None,
|
|
253
|
+
sample_mapping=None,
|
|
254
|
+
save_path=pp_clustermap_dir,
|
|
255
|
+
sort_by=cfg.spatial_clustermap_sortby,
|
|
256
|
+
deaminase=deaminase,
|
|
257
|
+
index_col_suffix=cfg.reindexed_var_suffix,
|
|
258
|
+
)
|
|
145
259
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
260
|
+
# ============================================================
|
|
261
|
+
# 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
|
|
262
|
+
# ============================================================
|
|
263
|
+
pp_dir_dedup = pp_dir / "deduplicated"
|
|
264
|
+
pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
|
|
265
|
+
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
266
|
+
|
|
267
|
+
# Clustermaps on deduplicated adata
|
|
268
|
+
if pp_clustermap_dir_dedup.is_dir() and not getattr(
|
|
269
|
+
cfg, "force_redo_spatial_analyses", False
|
|
270
|
+
):
|
|
271
|
+
print(f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData.")
|
|
153
272
|
else:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
if pp_umap_dir.is_dir():
|
|
184
|
-
print(f'{pp_umap_dir} already exists. Skipping UMAP plotting.')
|
|
273
|
+
make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
|
|
274
|
+
combined_raw_clustermap(
|
|
275
|
+
adata,
|
|
276
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
277
|
+
reference_col=cfg.reference_column,
|
|
278
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
279
|
+
layer_c=cfg.layer_for_clustermap_plotting,
|
|
280
|
+
layer_gpc=cfg.layer_for_clustermap_plotting,
|
|
281
|
+
layer_cpg=cfg.layer_for_clustermap_plotting,
|
|
282
|
+
layer_a=cfg.layer_for_clustermap_plotting,
|
|
283
|
+
cmap_c=cfg.clustermap_cmap_c,
|
|
284
|
+
cmap_gpc=cfg.clustermap_cmap_gpc,
|
|
285
|
+
cmap_cpg=cfg.clustermap_cmap_cpg,
|
|
286
|
+
cmap_a=cfg.clustermap_cmap_a,
|
|
287
|
+
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
288
|
+
min_length=cfg.read_len_filter_thresholds[0],
|
|
289
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
|
|
290
|
+
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
291
|
+
bins=None,
|
|
292
|
+
sample_mapping=None,
|
|
293
|
+
save_path=pp_clustermap_dir_dedup,
|
|
294
|
+
sort_by=cfg.spatial_clustermap_sortby,
|
|
295
|
+
deaminase=deaminase,
|
|
296
|
+
index_col_suffix=cfg.reindexed_var_suffix,
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
# UMAP / Leiden
|
|
300
|
+
if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
301
|
+
print(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
|
|
185
302
|
else:
|
|
186
|
-
from ..tools import calculate_umap
|
|
187
303
|
make_dirs([pp_umap_dir])
|
|
188
304
|
|
|
189
305
|
var_filters = []
|
|
190
|
-
if smf_modality ==
|
|
306
|
+
if smf_modality == "direct":
|
|
191
307
|
for ref in references:
|
|
192
308
|
for base in cfg.mod_target_bases:
|
|
193
|
-
var_filters
|
|
309
|
+
var_filters.append(f"{ref}_{base}_site")
|
|
194
310
|
elif deaminase:
|
|
195
311
|
for ref in references:
|
|
196
|
-
var_filters
|
|
312
|
+
var_filters.append(f"{ref}_C_site")
|
|
197
313
|
else:
|
|
198
314
|
for ref in references:
|
|
199
315
|
for base in cfg.mod_target_bases:
|
|
200
|
-
var_filters
|
|
316
|
+
var_filters.append(f"{ref}_{base}_site")
|
|
201
317
|
|
|
202
|
-
adata = calculate_umap(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
318
|
+
adata = calculate_umap(
|
|
319
|
+
adata,
|
|
320
|
+
layer=cfg.layer_for_umap_plotting,
|
|
321
|
+
var_filters=var_filters,
|
|
322
|
+
n_pcs=10,
|
|
323
|
+
knn_neighbors=15,
|
|
324
|
+
)
|
|
207
325
|
|
|
208
|
-
## Clustering
|
|
209
326
|
sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
|
|
210
327
|
|
|
211
|
-
# Plotting UMAP
|
|
212
328
|
sc.settings.figdir = pp_umap_dir
|
|
213
|
-
umap_layers = [
|
|
329
|
+
umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
|
|
214
330
|
umap_layers += cfg.umap_layers_to_plot
|
|
215
331
|
sc.pl.umap(adata, color=umap_layers, show=False, save=True)
|
|
216
332
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
pp_autocorr_dir = pp_dir / "08_autocorrelations"
|
|
333
|
+
# ============================================================
|
|
334
|
+
# 3) Spatial autocorrelation + rolling metrics
|
|
335
|
+
# ============================================================
|
|
336
|
+
pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
|
|
223
337
|
|
|
224
|
-
if pp_autocorr_dir.is_dir():
|
|
225
|
-
print(f
|
|
338
|
+
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
339
|
+
print(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
226
340
|
else:
|
|
227
341
|
positions = adata.var_names.astype(int).values
|
|
228
342
|
lags = np.arange(cfg.autocorr_max_lag + 1)
|
|
229
343
|
|
|
230
|
-
# optional: try to parallelize autocorr per-row with joblib
|
|
231
344
|
try:
|
|
232
345
|
from joblib import Parallel, delayed
|
|
233
346
|
_have_joblib = True
|
|
234
347
|
except Exception:
|
|
235
348
|
_have_joblib = False
|
|
236
349
|
|
|
350
|
+
samples = adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
351
|
+
ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
|
|
352
|
+
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
353
|
+
|
|
237
354
|
for site_type in cfg.autocorr_site_types:
|
|
238
355
|
layer_key = f"{site_type}_site_binary"
|
|
239
356
|
if layer_key not in adata.layers:
|
|
@@ -245,30 +362,27 @@ def spatial_adata(config_path):
|
|
|
245
362
|
print(f"Layer {layer_key} empty — skipping {site_type}.")
|
|
246
363
|
continue
|
|
247
364
|
|
|
248
|
-
# compute per-molecule autocorrs (and counts)
|
|
249
365
|
rows = []
|
|
250
366
|
counts = []
|
|
367
|
+
|
|
251
368
|
if _have_joblib:
|
|
252
|
-
# parallel map
|
|
253
369
|
def _worker(row):
|
|
254
370
|
try:
|
|
255
371
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
256
372
|
row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
|
|
257
373
|
)
|
|
258
|
-
except Exception
|
|
259
|
-
# on error return NaN arrays
|
|
374
|
+
except Exception:
|
|
260
375
|
ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
|
|
261
376
|
cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
|
|
262
377
|
return ac, cnts
|
|
263
378
|
|
|
264
|
-
res = Parallel(n_jobs=
|
|
379
|
+
res = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
|
|
265
380
|
delayed(_worker)(X[i]) for i in range(X.shape[0])
|
|
266
381
|
)
|
|
267
382
|
for ac, cnts in res:
|
|
268
383
|
rows.append(ac)
|
|
269
384
|
counts.append(cnts)
|
|
270
385
|
else:
|
|
271
|
-
# sequential fallback
|
|
272
386
|
for i in range(X.shape[0]):
|
|
273
387
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
274
388
|
X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
|
|
@@ -279,21 +393,23 @@ def spatial_adata(config_path):
|
|
|
279
393
|
autocorr_matrix = np.asarray(rows, dtype=np.float32)
|
|
280
394
|
counts_matrix = np.asarray(counts, dtype=np.int32)
|
|
281
395
|
|
|
282
|
-
# store raw per-molecule arrays (keep memory format compact)
|
|
283
396
|
adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
|
|
284
397
|
adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
|
|
285
398
|
adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
|
|
286
399
|
|
|
287
|
-
# compute global periodicity metrics across all molecules for this site_type
|
|
288
400
|
try:
|
|
289
401
|
results = analyze_autocorr_matrix(
|
|
290
|
-
autocorr_matrix,
|
|
291
|
-
|
|
402
|
+
autocorr_matrix,
|
|
403
|
+
counts_matrix,
|
|
404
|
+
lags,
|
|
405
|
+
nrl_search_bp=(120, 260),
|
|
406
|
+
pad_factor=4,
|
|
407
|
+
min_count=20,
|
|
408
|
+
max_harmonics=6,
|
|
292
409
|
)
|
|
293
410
|
except Exception as e:
|
|
294
411
|
results = {"error": str(e)}
|
|
295
412
|
|
|
296
|
-
# store global metrics (same keys you used)
|
|
297
413
|
global_metrics = {
|
|
298
414
|
"nrl_bp": results.get("nrl_bp", np.nan),
|
|
299
415
|
"xi": results.get("xi", np.nan),
|
|
@@ -305,13 +421,16 @@ def spatial_adata(config_path):
|
|
|
305
421
|
}
|
|
306
422
|
adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
|
|
307
423
|
|
|
308
|
-
# bootstrap for CI (use a reasonable default; set low only for debugging)
|
|
309
424
|
n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
|
|
310
|
-
# if user intentionally set very low n_boot in cfg, we keep that; otherwise default 200
|
|
311
425
|
try:
|
|
312
426
|
bs = bootstrap_periodicity(
|
|
313
|
-
autocorr_matrix,
|
|
314
|
-
|
|
427
|
+
autocorr_matrix,
|
|
428
|
+
counts_matrix,
|
|
429
|
+
lags,
|
|
430
|
+
n_boot=n_boot,
|
|
431
|
+
nrl_search_bp=(120, 260),
|
|
432
|
+
pad_factor=4,
|
|
433
|
+
min_count=20,
|
|
315
434
|
)
|
|
316
435
|
adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
|
|
317
436
|
"nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
|
|
@@ -320,57 +439,70 @@ def spatial_adata(config_path):
|
|
|
320
439
|
except Exception as e:
|
|
321
440
|
adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
|
|
322
441
|
|
|
323
|
-
# ----------------------------
|
|
324
|
-
# Compute group-level metrics for plotting (per sample × reference)
|
|
325
|
-
# ----------------------------
|
|
326
442
|
metrics_by_group = {}
|
|
327
443
|
sample_col = cfg.sample_name_col_for_plotting
|
|
328
|
-
ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
|
|
329
|
-
samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
|
|
330
|
-
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
331
444
|
|
|
332
|
-
# iterate groups and run analyzer on each group's subset; cache errors
|
|
333
445
|
for sample_name in samples:
|
|
334
|
-
sample_mask =
|
|
446
|
+
sample_mask = adata.obs[sample_col].values == sample_name
|
|
447
|
+
|
|
335
448
|
# combined group
|
|
336
449
|
mask = sample_mask
|
|
337
450
|
ac_sel = autocorr_matrix[mask, :]
|
|
338
451
|
cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
|
|
339
452
|
if ac_sel.size:
|
|
340
453
|
try:
|
|
341
|
-
r = analyze_autocorr_matrix(
|
|
342
|
-
|
|
454
|
+
r = analyze_autocorr_matrix(
|
|
455
|
+
ac_sel,
|
|
456
|
+
cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
|
|
457
|
+
lags,
|
|
458
|
+
nrl_search_bp=(120, 260),
|
|
459
|
+
pad_factor=4,
|
|
460
|
+
min_count=10,
|
|
461
|
+
max_harmonics=6,
|
|
462
|
+
)
|
|
343
463
|
except Exception as e:
|
|
344
464
|
r = {"error": str(e)}
|
|
345
465
|
else:
|
|
346
466
|
r = {"error": "no_data"}
|
|
347
467
|
metrics_by_group[(sample_name, None)] = r
|
|
348
468
|
|
|
349
|
-
# per-reference groups
|
|
350
469
|
for ref in refs:
|
|
351
470
|
mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
|
|
352
471
|
ac_sel = autocorr_matrix[mask_ref, :]
|
|
353
472
|
cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
|
|
354
473
|
if ac_sel.size:
|
|
355
474
|
try:
|
|
356
|
-
r = analyze_autocorr_matrix(
|
|
357
|
-
|
|
475
|
+
r = analyze_autocorr_matrix(
|
|
476
|
+
ac_sel,
|
|
477
|
+
cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
|
|
478
|
+
lags,
|
|
479
|
+
nrl_search_bp=(120, 260),
|
|
480
|
+
pad_factor=4,
|
|
481
|
+
min_count=10,
|
|
482
|
+
max_harmonics=6,
|
|
483
|
+
)
|
|
358
484
|
except Exception as e:
|
|
359
485
|
r = {"error": str(e)}
|
|
360
486
|
else:
|
|
361
487
|
r = {"error": "no_data"}
|
|
362
488
|
metrics_by_group[(sample_name, ref)] = r
|
|
363
489
|
|
|
364
|
-
# persist group metrics
|
|
365
490
|
adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
|
|
366
491
|
|
|
367
492
|
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
|
|
368
493
|
|
|
369
|
-
# configuration / sensible defaults (override in cfg if present)
|
|
370
494
|
rolling_cfg = {
|
|
371
|
-
"window_size": getattr(
|
|
495
|
+
"window_size": getattr(
|
|
496
|
+
cfg,
|
|
497
|
+
"rolling_window_size",
|
|
498
|
+
getattr(cfg, "autocorr_rolling_window_size", 600),
|
|
499
|
+
),
|
|
372
500
|
"step": getattr(cfg, "rolling_step", 100),
|
|
373
|
-
"max_lag": getattr(
|
|
501
|
+
"max_lag": getattr(
|
|
502
|
+
cfg,
|
|
503
|
+
"rolling_max_lag",
|
|
504
|
+
getattr(cfg, "autocorr_max_lag", 500),
|
|
505
|
+
),
|
|
374
506
|
"min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
|
|
375
507
|
"nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
|
|
376
508
|
"pad_factor": getattr(cfg, "rolling_pad_factor", 4),
|
|
@@ -381,23 +513,19 @@ def spatial_adata(config_path):
|
|
|
381
513
|
|
|
382
514
|
write_plots = getattr(cfg, "rolling_write_plots", True)
|
|
383
515
|
write_csvs = getattr(cfg, "rolling_write_csvs", True)
|
|
384
|
-
min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
|
|
516
|
+
min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
|
|
385
517
|
|
|
386
518
|
rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
|
|
387
519
|
os.makedirs(rolling_out_dir, exist_ok=True)
|
|
388
|
-
# also a per-site subfolder
|
|
389
520
|
site_out_dir = os.path.join(rolling_out_dir, site_type)
|
|
390
521
|
os.makedirs(site_out_dir, exist_ok=True)
|
|
391
522
|
|
|
392
|
-
combined_rows = []
|
|
393
|
-
rolling_results_by_group = {}
|
|
523
|
+
combined_rows = []
|
|
524
|
+
rolling_results_by_group = {}
|
|
394
525
|
|
|
395
|
-
# iterate groups (samples × refs). `samples` and `refs` were computed above.
|
|
396
526
|
for sample_name in samples:
|
|
397
|
-
sample_mask =
|
|
398
|
-
# first the combined group ("all refs")
|
|
527
|
+
sample_mask = adata.obs[sample_col].values == sample_name
|
|
399
528
|
group_masks = [("all", sample_mask)]
|
|
400
|
-
# then per-reference groups
|
|
401
529
|
for ref in refs:
|
|
402
530
|
ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
|
|
403
531
|
group_masks.append((ref, ref_mask))
|
|
@@ -405,17 +533,10 @@ def spatial_adata(config_path):
|
|
|
405
533
|
for ref_label, mask in group_masks:
|
|
406
534
|
n_group = int(mask.sum())
|
|
407
535
|
if n_group < min_molecules_for_group:
|
|
408
|
-
# skip tiny groups
|
|
409
|
-
if cfg.get("verbosity", 0) if hasattr(cfg, "get") else False:
|
|
410
|
-
print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
|
|
411
|
-
# still write an empty CSV row set if desired; here we skip
|
|
412
536
|
continue
|
|
413
537
|
|
|
414
|
-
# extract group matrix X_group (works with dense or sparse adata.layers)
|
|
415
538
|
X_group = X[mask, :]
|
|
416
|
-
# positions already set above
|
|
417
539
|
try:
|
|
418
|
-
# call your rolling function (this may be slow; it uses cfg.n_jobs)
|
|
419
540
|
df_roll = rolling_autocorr_metrics(
|
|
420
541
|
X_group,
|
|
421
542
|
positions,
|
|
@@ -430,17 +551,20 @@ def spatial_adata(config_path):
|
|
|
430
551
|
max_harmonics=rolling_cfg["max_harmonics"],
|
|
431
552
|
n_jobs=rolling_cfg["n_jobs"],
|
|
432
553
|
verbose=False,
|
|
433
|
-
fixed_nrl_bp=global_nrl
|
|
554
|
+
fixed_nrl_bp=global_nrl,
|
|
434
555
|
)
|
|
435
556
|
except Exception as e:
|
|
436
|
-
warnings.warn(
|
|
557
|
+
warnings.warn(
|
|
558
|
+
f"rolling_autocorr_metrics failed for {site_type} "
|
|
559
|
+
f"{sample_name} {ref_label}: {e}"
|
|
560
|
+
)
|
|
437
561
|
continue
|
|
438
562
|
|
|
439
|
-
# normalize column names and keep only the compact set you want
|
|
440
|
-
# keep: center, n_molecules, nrl_bp, snr, xi, fwhm_bp
|
|
441
563
|
if "center" not in df_roll.columns:
|
|
442
|
-
|
|
443
|
-
|
|
564
|
+
warnings.warn(
|
|
565
|
+
f"rolling_autocorr_metrics returned unexpected schema "
|
|
566
|
+
f"for {site_type} {sample_name} {ref_label}"
|
|
567
|
+
)
|
|
444
568
|
continue
|
|
445
569
|
|
|
446
570
|
compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
|
|
@@ -448,117 +572,126 @@ def spatial_adata(config_path):
|
|
|
448
572
|
compact_df["sample"] = sample_name
|
|
449
573
|
compact_df["reference"] = ref_label if ref_label != "all" else "all"
|
|
450
574
|
|
|
451
|
-
# save per-group CSV
|
|
452
575
|
if write_csvs:
|
|
453
576
|
safe_sample = str(sample_name).replace(os.sep, "_")
|
|
454
577
|
safe_ref = str(ref_label if ref_label != "all" else "all").replace(os.sep, "_")
|
|
455
|
-
out_csv = os.path.join(
|
|
578
|
+
out_csv = os.path.join(
|
|
579
|
+
site_out_dir,
|
|
580
|
+
f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
|
|
581
|
+
)
|
|
456
582
|
try:
|
|
457
583
|
compact_df.to_csv(out_csv, index=False)
|
|
458
584
|
except Exception as e:
|
|
459
585
|
warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
|
|
460
586
|
|
|
461
|
-
# save a plot per-group (NRL and SNR vs center)
|
|
462
587
|
if write_plots:
|
|
463
588
|
try:
|
|
464
|
-
# use your plot helper; if it's in a different module, import accordingly
|
|
465
589
|
from ..plotting import plot_rolling_metrics as _plot_roll
|
|
466
590
|
except Exception:
|
|
467
|
-
_plot_roll =
|
|
591
|
+
_plot_roll = None
|
|
468
592
|
if _plot_roll is not None:
|
|
469
|
-
plot_png = os.path.join(
|
|
593
|
+
plot_png = os.path.join(
|
|
594
|
+
site_out_dir,
|
|
595
|
+
f"{safe_sample}__{safe_ref}__rolling_metrics.png",
|
|
596
|
+
)
|
|
470
597
|
try:
|
|
471
|
-
_plot_roll(
|
|
472
|
-
|
|
473
|
-
|
|
598
|
+
_plot_roll(
|
|
599
|
+
compact_df,
|
|
600
|
+
out_png=plot_png,
|
|
601
|
+
title=f"{site_type} {sample_name} {ref_label}",
|
|
602
|
+
figsize=(10, 3.5),
|
|
603
|
+
dpi=160,
|
|
604
|
+
show=False,
|
|
605
|
+
)
|
|
474
606
|
except Exception as e:
|
|
475
|
-
warnings.warn(
|
|
607
|
+
warnings.warn(
|
|
608
|
+
f"Failed to create rolling plot for {site_type} "
|
|
609
|
+
f"{sample_name} {ref_label}: {e}"
|
|
610
|
+
)
|
|
476
611
|
|
|
477
|
-
|
|
478
|
-
|
|
612
|
+
combined_rows.append(
|
|
613
|
+
compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
|
|
614
|
+
)
|
|
479
615
|
rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
|
|
480
616
|
|
|
481
|
-
# persist per-site rolling metrics into adata.uns as dict of DataFrames (or empty dict)
|
|
482
617
|
adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
|
|
483
618
|
|
|
484
|
-
|
|
485
|
-
if len(combined_rows):
|
|
619
|
+
if combined_rows:
|
|
486
620
|
combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
|
|
487
|
-
combined_out_csv = os.path.join(
|
|
621
|
+
combined_out_csv = os.path.join(
|
|
622
|
+
rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv"
|
|
623
|
+
)
|
|
488
624
|
try:
|
|
489
625
|
combined_df_site.to_csv(combined_out_csv, index=False)
|
|
490
626
|
except Exception as e:
|
|
491
|
-
warnings.warn(
|
|
627
|
+
warnings.warn(
|
|
628
|
+
f"Failed to write combined rolling CSV for {site_type}: {e}"
|
|
629
|
+
)
|
|
492
630
|
|
|
493
631
|
rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
|
|
494
632
|
plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
|
|
495
633
|
os.makedirs(plot_out_dir, exist_ok=True)
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
plot_spatial_autocorr_grid(adata,
|
|
507
|
-
pp_autocorr_dir,
|
|
508
|
-
site_types=cfg.autocorr_site_types,
|
|
509
|
-
sample_col=cfg.sample_name_col_for_plotting,
|
|
510
|
-
window=cfg.autocorr_rolling_window_size,
|
|
511
|
-
rows_per_fig=cfg.rows_per_qc_autocorr_grid)
|
|
512
|
-
|
|
513
|
-
############ Pearson analyses ###############
|
|
514
|
-
if smf_modality != 'direct':
|
|
515
|
-
from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
|
|
516
|
-
|
|
517
|
-
pp_corr_dir = pp_dir / "09_correlation_matrices"
|
|
518
|
-
|
|
519
|
-
if pp_corr_dir.is_dir():
|
|
520
|
-
print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
|
|
521
|
-
else:
|
|
522
|
-
compute_positionwise_statistics(
|
|
523
|
-
adata,
|
|
524
|
-
layer="nan0_0minus1",
|
|
525
|
-
methods=cfg.correlation_matrix_types,
|
|
526
|
-
sample_col=cfg.sample_name_col_for_plotting,
|
|
527
|
-
ref_col=cfg.reference_column,
|
|
528
|
-
output_key="positionwise_result",
|
|
529
|
-
site_types=cfg.correlation_matrix_site_types,
|
|
530
|
-
encoding="signed",
|
|
531
|
-
max_threads=cfg.threads,
|
|
532
|
-
min_count_for_pairwise=10,
|
|
634
|
+
_ = plot_rolling_grid(
|
|
635
|
+
rolling_dict,
|
|
636
|
+
plot_out_dir,
|
|
637
|
+
site_type,
|
|
638
|
+
rows_per_page=cfg.rows_per_qc_autocorr_grid,
|
|
639
|
+
cols_per_page=len(refs),
|
|
640
|
+
dpi=160,
|
|
641
|
+
metrics=("nrl_bp", "snr", "xi"),
|
|
642
|
+
per_metric_ylim={"snr": (0, 25)},
|
|
533
643
|
)
|
|
534
|
-
|
|
535
|
-
|
|
644
|
+
|
|
645
|
+
make_dirs([pp_autocorr_dir])
|
|
646
|
+
plot_spatial_autocorr_grid(
|
|
536
647
|
adata,
|
|
537
|
-
|
|
648
|
+
pp_autocorr_dir,
|
|
649
|
+
site_types=cfg.autocorr_site_types,
|
|
538
650
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
dpi=160,
|
|
542
|
-
cmaps=cfg.correlation_matrix_cmaps,
|
|
543
|
-
vmin=None,
|
|
544
|
-
vmax=None,
|
|
545
|
-
output_dir=pp_corr_dir,
|
|
546
|
-
output_key= "positionwise_result"
|
|
651
|
+
window=cfg.autocorr_rolling_window_size,
|
|
652
|
+
rows_per_fig=cfg.rows_per_qc_autocorr_grid,
|
|
547
653
|
)
|
|
548
654
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
if ".gz" == spatial_adata_path.suffix:
|
|
554
|
-
print(f"Spatial adata path: {spatial_adata_path}")
|
|
555
|
-
safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
|
|
556
|
-
else:
|
|
557
|
-
spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
|
|
558
|
-
print(f"Spatial adata path: {spatial_adata_path}")
|
|
559
|
-
safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
|
|
560
|
-
############################################### smftools spatial end ###############################################
|
|
655
|
+
# ============================================================
|
|
656
|
+
# 4) Pearson / correlation matrices
|
|
657
|
+
# ============================================================
|
|
658
|
+
pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
|
|
561
659
|
|
|
562
|
-
|
|
660
|
+
if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
661
|
+
print(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
662
|
+
else:
|
|
663
|
+
compute_positionwise_statistics(
|
|
664
|
+
adata,
|
|
665
|
+
layer="nan0_0minus1",
|
|
666
|
+
methods=cfg.correlation_matrix_types,
|
|
667
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
668
|
+
ref_col=cfg.reference_column,
|
|
669
|
+
output_key="positionwise_result",
|
|
670
|
+
site_types=cfg.correlation_matrix_site_types,
|
|
671
|
+
encoding="signed",
|
|
672
|
+
max_threads=cfg.threads,
|
|
673
|
+
min_count_for_pairwise=10,
|
|
674
|
+
)
|
|
675
|
+
|
|
676
|
+
plot_positionwise_matrices(
|
|
677
|
+
adata,
|
|
678
|
+
methods=cfg.correlation_matrix_types,
|
|
679
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
680
|
+
ref_col=cfg.reference_column,
|
|
681
|
+
figsize_per_cell=(4.0, 3.0),
|
|
682
|
+
dpi=160,
|
|
683
|
+
cmaps=cfg.correlation_matrix_cmaps,
|
|
684
|
+
vmin=None,
|
|
685
|
+
vmax=None,
|
|
686
|
+
output_dir=pp_corr_dir,
|
|
687
|
+
output_key="positionwise_result",
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
# ============================================================
|
|
691
|
+
# 5) Save spatial AnnData
|
|
692
|
+
# ============================================================
|
|
693
|
+
if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
|
|
694
|
+
print("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
|
|
695
|
+
write_gz_h5ad(adata, spatial_adata_path)
|
|
563
696
|
|
|
564
697
|
return adata, spatial_adata_path
|