smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,277 +1,458 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import anndata as ad
|
|
5
|
+
|
|
6
|
+
from smftools.logging_utils import get_logger
|
|
5
7
|
|
|
6
|
-
|
|
7
|
-
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
8
|
+
logger = get_logger(__name__)
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
def spatial_adata(
|
|
12
|
+
config_path: str,
|
|
13
|
+
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
|
|
14
|
+
"""
|
|
15
|
+
CLI-facing wrapper for spatial analyses.
|
|
16
|
+
|
|
17
|
+
Called by: `smftools spatial <config_path>`
|
|
18
|
+
|
|
19
|
+
Responsibilities:
|
|
20
|
+
- Ensure a usable AnnData exists via `load_adata` + `preprocess_adata`.
|
|
21
|
+
- Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
|
|
22
|
+
- Respect cfg.force_redo_spatial_analyses.
|
|
23
|
+
- Decide whether to skip (return existing) or run the spatial core.
|
|
24
|
+
- Call `spatial_adata_core(...)` when actual work is needed.
|
|
25
|
+
|
|
26
|
+
Returns
|
|
27
|
+
-------
|
|
28
|
+
spatial_adata : AnnData | None
|
|
29
|
+
AnnData with spatial analyses, or None if we skipped because a later-stage
|
|
30
|
+
AnnData already exists.
|
|
31
|
+
spatial_adata_path : Path | None
|
|
32
|
+
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
11
33
|
"""
|
|
12
|
-
from ..readwrite import
|
|
34
|
+
from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
|
|
35
|
+
from .helpers import get_adata_paths
|
|
13
36
|
from .load_adata import load_adata
|
|
14
37
|
from .preprocess_adata import preprocess_adata
|
|
15
38
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
39
|
+
# 1) Ensure config + basic paths via load_adata
|
|
40
|
+
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
41
|
+
paths = get_adata_paths(cfg)
|
|
42
|
+
|
|
43
|
+
raw_path = paths.raw
|
|
44
|
+
pp_path = paths.pp
|
|
45
|
+
pp_dedup_path = paths.pp_dedup
|
|
46
|
+
spatial_path = paths.spatial
|
|
47
|
+
hmm_path = paths.hmm
|
|
48
|
+
|
|
49
|
+
# Stage-skipping logic for spatial
|
|
50
|
+
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
51
|
+
# If HMM exists, it's the most processed stage — reuse it.
|
|
52
|
+
if hmm_path.exists():
|
|
53
|
+
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
54
|
+
return None, hmm_path
|
|
55
|
+
|
|
56
|
+
# If spatial exists, we consider spatial analyses already done.
|
|
57
|
+
if spatial_path.exists():
|
|
58
|
+
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
59
|
+
return None, spatial_path
|
|
60
|
+
|
|
61
|
+
# 2) Ensure preprocessing has been run
|
|
62
|
+
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
63
|
+
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
64
|
+
config_path
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Helper to load from disk, reusing loaded_adata if it matches
|
|
68
|
+
def _load(path: Path):
|
|
69
|
+
if loaded_adata is not None and loaded_path == path:
|
|
70
|
+
return loaded_adata
|
|
71
|
+
adata, _ = safe_read_h5ad(path)
|
|
72
|
+
return adata
|
|
73
|
+
|
|
74
|
+
# 3) Decide which AnnData to use as the *starting point* for spatial analyses
|
|
75
|
+
# Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
|
|
76
|
+
if pp_dedup_adata is not None:
|
|
77
|
+
start_adata = pp_dedup_adata
|
|
78
|
+
source_path = pp_dedup_adata_path_ret
|
|
79
|
+
else:
|
|
80
|
+
if pp_dedup_path.exists():
|
|
81
|
+
start_adata = _load(pp_dedup_path)
|
|
82
|
+
source_path = pp_dedup_path
|
|
83
|
+
elif pp_path.exists():
|
|
84
|
+
start_adata = _load(pp_path)
|
|
85
|
+
source_path = pp_path
|
|
86
|
+
elif raw_path.exists():
|
|
87
|
+
start_adata = _load(raw_path)
|
|
88
|
+
source_path = raw_path
|
|
89
|
+
else:
|
|
90
|
+
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
91
|
+
return None, None
|
|
92
|
+
|
|
93
|
+
# 4) Run the spatial core
|
|
94
|
+
adata_spatial, spatial_path = spatial_adata_core(
|
|
95
|
+
adata=start_adata,
|
|
96
|
+
cfg=cfg,
|
|
97
|
+
spatial_adata_path=spatial_path,
|
|
98
|
+
pp_adata_path=pp_path,
|
|
99
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
100
|
+
pp_adata_in_memory=pp_adata,
|
|
101
|
+
source_adata_path=source_path,
|
|
102
|
+
config_path=config_path,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# 5) Register spatial path in summary CSV
|
|
106
|
+
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
|
|
107
|
+
|
|
108
|
+
return adata_spatial, spatial_path
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def spatial_adata_core(
|
|
112
|
+
adata: ad.AnnData,
|
|
113
|
+
cfg,
|
|
114
|
+
spatial_adata_path: Path,
|
|
115
|
+
pp_adata_path: Path,
|
|
116
|
+
pp_dup_rem_adata_path: Path,
|
|
117
|
+
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
118
|
+
source_adata_path: Optional[Path] = None,
|
|
119
|
+
config_path: Optional[str] = None,
|
|
120
|
+
) -> Tuple[ad.AnnData, Path]:
|
|
121
|
+
"""
|
|
122
|
+
Core spatial analysis pipeline.
|
|
123
|
+
|
|
124
|
+
Assumes:
|
|
125
|
+
- `adata` is (typically) the preprocessed, duplicate-removed AnnData.
|
|
126
|
+
- `cfg` is the ExperimentConfig.
|
|
127
|
+
- `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
|
|
128
|
+
from `get_adata_paths`.
|
|
129
|
+
- `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
|
|
130
|
+
the same run of `preprocess_adata`, to avoid re-reading from disk.
|
|
131
|
+
|
|
132
|
+
Does:
|
|
133
|
+
- Optional sample sheet load.
|
|
134
|
+
- Optional inversion & reindexing.
|
|
135
|
+
- Clustermaps on:
|
|
136
|
+
* preprocessed (non-dedup) AnnData (for non-direct modalities), and
|
|
137
|
+
* deduplicated preprocessed AnnData.
|
|
138
|
+
- PCA/UMAP/Leiden.
|
|
139
|
+
- Autocorrelation + rolling metrics + grids.
|
|
140
|
+
- Positionwise correlation matrices (non-direct modalities).
|
|
141
|
+
- Save spatial AnnData to `spatial_adata_path`.
|
|
142
|
+
|
|
143
|
+
Returns
|
|
144
|
+
-------
|
|
145
|
+
adata : AnnData
|
|
146
|
+
Spatially analyzed AnnData (same object, modified in-place).
|
|
147
|
+
spatial_adata_path : Path
|
|
148
|
+
Path where spatial AnnData was written.
|
|
149
|
+
"""
|
|
21
150
|
import os
|
|
22
|
-
|
|
151
|
+
import warnings
|
|
23
152
|
from pathlib import Path
|
|
24
153
|
|
|
25
|
-
|
|
26
|
-
|
|
154
|
+
import numpy as np
|
|
155
|
+
import pandas as pd
|
|
156
|
+
import scanpy as sc
|
|
27
157
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
158
|
+
from ..metadata import record_smftools_metadata
|
|
159
|
+
from ..plotting import (
|
|
160
|
+
combined_raw_clustermap,
|
|
161
|
+
plot_rolling_grid,
|
|
162
|
+
plot_spatial_autocorr_grid,
|
|
163
|
+
)
|
|
164
|
+
from ..preprocessing import (
|
|
165
|
+
invert_adata,
|
|
166
|
+
load_sample_sheet,
|
|
167
|
+
reindex_references_adata,
|
|
168
|
+
)
|
|
169
|
+
from ..readwrite import make_dirs, safe_read_h5ad
|
|
170
|
+
from ..tools import calculate_umap
|
|
171
|
+
from ..tools.position_stats import (
|
|
172
|
+
compute_positionwise_statistics,
|
|
173
|
+
plot_positionwise_matrices,
|
|
174
|
+
)
|
|
175
|
+
from ..tools.spatial_autocorrelation import (
|
|
176
|
+
analyze_autocorr_matrix,
|
|
177
|
+
binary_autocorrelation_with_spacing,
|
|
178
|
+
bootstrap_periodicity,
|
|
179
|
+
rolling_autocorr_metrics,
|
|
180
|
+
)
|
|
181
|
+
from .helpers import write_gz_h5ad
|
|
182
|
+
|
|
183
|
+
# -----------------------------
|
|
184
|
+
# General setup
|
|
185
|
+
# -----------------------------
|
|
186
|
+
output_directory = Path(cfg.output_directory)
|
|
34
187
|
make_dirs([output_directory])
|
|
35
|
-
############################################### smftools load end ###############################################
|
|
36
188
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
############################################### smftools preprocess end ###############################################
|
|
40
|
-
|
|
41
|
-
############################################### smftools spatial start ###############################################
|
|
42
|
-
input_manager_df = pd.read_csv(cfg.summary_file)
|
|
43
|
-
initial_adata_path = Path(input_manager_df['load_adata'][0])
|
|
44
|
-
pp_adata_path = Path(input_manager_df['pp_adata'][0])
|
|
45
|
-
pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
|
|
46
|
-
spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
|
|
47
|
-
hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
|
|
48
|
-
|
|
49
|
-
if smf_modality == 'conversion':
|
|
189
|
+
smf_modality = cfg.smf_modality
|
|
190
|
+
if smf_modality == "conversion":
|
|
50
191
|
deaminase = False
|
|
51
192
|
else:
|
|
52
193
|
deaminase = True
|
|
53
194
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
195
|
+
first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
|
|
196
|
+
|
|
197
|
+
# -----------------------------
|
|
198
|
+
# Optional sample sheet metadata
|
|
199
|
+
# -----------------------------
|
|
200
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
201
|
+
load_sample_sheet(
|
|
202
|
+
adata,
|
|
203
|
+
cfg.sample_sheet_path,
|
|
204
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
205
|
+
as_category=True,
|
|
206
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# -----------------------------
|
|
210
|
+
# Optional inversion along positions axis
|
|
211
|
+
# -----------------------------
|
|
212
|
+
if getattr(cfg, "invert_adata", False):
|
|
213
|
+
adata = invert_adata(adata)
|
|
214
|
+
|
|
215
|
+
# -----------------------------
|
|
216
|
+
# Optional reindexing by reference
|
|
217
|
+
# -----------------------------
|
|
218
|
+
reindex_references_adata(
|
|
219
|
+
adata,
|
|
220
|
+
reference_col=cfg.reference_column,
|
|
221
|
+
offsets=cfg.reindexing_offsets,
|
|
222
|
+
new_col=cfg.reindexed_var_suffix,
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
if adata.uns.get("reindex_references_adata_performed", False):
|
|
226
|
+
reindex_suffix = cfg.reindexed_var_suffix
|
|
59
227
|
else:
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
initial_version_available = initial_adata_path.exists()
|
|
63
|
-
preprocessed_version_available = pp_adata_path.exists()
|
|
64
|
-
preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
|
|
65
|
-
preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
|
|
66
|
-
hmm_version_available = hmm_adata_path.exists()
|
|
67
|
-
|
|
68
|
-
if cfg.force_redo_basic_analyses:
|
|
69
|
-
print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
|
|
70
|
-
if preprocessed_dup_removed_version_available:
|
|
71
|
-
adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
|
|
72
|
-
adata_version = "pp_dedup"
|
|
73
|
-
elif preprocessed_version_available:
|
|
74
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
75
|
-
adata_version = "pp"
|
|
76
|
-
elif initial_version_available:
|
|
77
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
78
|
-
adata_version = "initial"
|
|
79
|
-
else:
|
|
80
|
-
print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
|
|
81
|
-
return
|
|
82
|
-
elif preprocessed_dedup_spatial_version_available:
|
|
83
|
-
print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
|
|
84
|
-
return None, spatial_adata_path
|
|
85
|
-
elif preprocessed_dup_removed_version_available:
|
|
86
|
-
adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
|
|
87
|
-
adata_version = "pp_dedup"
|
|
88
|
-
elif preprocessed_version_available:
|
|
89
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
90
|
-
adata_version = "pp"
|
|
91
|
-
elif initial_version_available:
|
|
92
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
93
|
-
adata_version = "initial"
|
|
94
|
-
else:
|
|
95
|
-
print(f"No adata available.")
|
|
96
|
-
return
|
|
97
|
-
|
|
228
|
+
reindex_suffix = None
|
|
229
|
+
|
|
98
230
|
pp_dir = output_directory / "preprocessed"
|
|
99
231
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
100
232
|
|
|
101
|
-
|
|
102
|
-
|
|
233
|
+
# ============================================================
|
|
234
|
+
# 1) Clustermaps (non-direct modalities) on *preprocessed* data
|
|
235
|
+
# ============================================================
|
|
236
|
+
if smf_modality != "direct":
|
|
237
|
+
preprocessed_version_available = pp_adata_path.exists()
|
|
238
|
+
|
|
103
239
|
if preprocessed_version_available:
|
|
104
240
|
pp_clustermap_dir = pp_dir / "06_clustermaps"
|
|
105
241
|
|
|
106
|
-
if pp_clustermap_dir.is_dir()
|
|
107
|
-
|
|
242
|
+
if pp_clustermap_dir.is_dir() and not getattr(
|
|
243
|
+
cfg, "force_redo_spatial_analyses", False
|
|
244
|
+
):
|
|
245
|
+
logger.debug(
|
|
246
|
+
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
247
|
+
)
|
|
108
248
|
else:
|
|
109
|
-
from ..plotting import combined_raw_clustermap
|
|
110
249
|
make_dirs([pp_dir, pp_clustermap_dir])
|
|
111
250
|
|
|
112
|
-
if not
|
|
113
|
-
pp_adata
|
|
251
|
+
if first_pp_run and (pp_adata_in_memory is not None):
|
|
252
|
+
pp_adata = pp_adata_in_memory
|
|
114
253
|
else:
|
|
115
|
-
pp_adata =
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
cmap_a="coolwarm",
|
|
129
|
-
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
130
|
-
min_length=cfg.read_len_filter_thresholds[0],
|
|
131
|
-
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
|
|
132
|
-
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
133
|
-
bins=None,
|
|
134
|
-
sample_mapping=None,
|
|
135
|
-
save_path=pp_clustermap_dir,
|
|
136
|
-
sort_by='gpc',
|
|
137
|
-
deaminase=deaminase)
|
|
138
|
-
if first_pp_run:
|
|
139
|
-
adata = adata_unique
|
|
140
|
-
else:
|
|
141
|
-
pass
|
|
254
|
+
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
255
|
+
|
|
256
|
+
# -----------------------------
|
|
257
|
+
# Optional sample sheet metadata
|
|
258
|
+
# -----------------------------
|
|
259
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
260
|
+
load_sample_sheet(
|
|
261
|
+
pp_adata,
|
|
262
|
+
cfg.sample_sheet_path,
|
|
263
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
264
|
+
as_category=True,
|
|
265
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
266
|
+
)
|
|
142
267
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
268
|
+
# -----------------------------
|
|
269
|
+
# Optional inversion along positions axis
|
|
270
|
+
# -----------------------------
|
|
271
|
+
if getattr(cfg, "invert_adata", False):
|
|
272
|
+
pp_adata = invert_adata(pp_adata)
|
|
273
|
+
|
|
274
|
+
# -----------------------------
|
|
275
|
+
# Optional reindexing by reference
|
|
276
|
+
# -----------------------------
|
|
277
|
+
reindex_references_adata(
|
|
278
|
+
pp_adata,
|
|
279
|
+
reference_col=cfg.reference_column,
|
|
280
|
+
offsets=cfg.reindexing_offsets,
|
|
281
|
+
new_col=cfg.reindexed_var_suffix,
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
combined_raw_clustermap(
|
|
285
|
+
pp_adata,
|
|
286
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
287
|
+
reference_col=cfg.reference_column,
|
|
288
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
289
|
+
layer_c=cfg.layer_for_clustermap_plotting,
|
|
290
|
+
layer_gpc=cfg.layer_for_clustermap_plotting,
|
|
291
|
+
layer_cpg=cfg.layer_for_clustermap_plotting,
|
|
292
|
+
layer_a=cfg.layer_for_clustermap_plotting,
|
|
293
|
+
cmap_c=cfg.clustermap_cmap_c,
|
|
294
|
+
cmap_gpc=cfg.clustermap_cmap_gpc,
|
|
295
|
+
cmap_cpg=cfg.clustermap_cmap_cpg,
|
|
296
|
+
cmap_a=cfg.clustermap_cmap_a,
|
|
297
|
+
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
298
|
+
min_length=cfg.read_len_filter_thresholds[0],
|
|
299
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
300
|
+
0
|
|
301
|
+
],
|
|
302
|
+
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
303
|
+
demux_types=("double", "already"),
|
|
304
|
+
bins=None,
|
|
305
|
+
sample_mapping=None,
|
|
306
|
+
save_path=pp_clustermap_dir,
|
|
307
|
+
sort_by=cfg.spatial_clustermap_sortby,
|
|
308
|
+
deaminase=deaminase,
|
|
309
|
+
index_col_suffix=reindex_suffix,
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
# ============================================================
|
|
313
|
+
# 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
|
|
314
|
+
# ============================================================
|
|
315
|
+
pp_dir_dedup = pp_dir / "deduplicated"
|
|
316
|
+
pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
|
|
317
|
+
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
318
|
+
|
|
319
|
+
# Clustermaps on deduplicated adata
|
|
320
|
+
if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
321
|
+
logger.debug(
|
|
322
|
+
f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
323
|
+
)
|
|
153
324
|
else:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
325
|
+
make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
|
|
326
|
+
combined_raw_clustermap(
|
|
327
|
+
adata,
|
|
328
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
329
|
+
reference_col=cfg.reference_column,
|
|
330
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
331
|
+
layer_c=cfg.layer_for_clustermap_plotting,
|
|
332
|
+
layer_gpc=cfg.layer_for_clustermap_plotting,
|
|
333
|
+
layer_cpg=cfg.layer_for_clustermap_plotting,
|
|
334
|
+
layer_a=cfg.layer_for_clustermap_plotting,
|
|
335
|
+
cmap_c=cfg.clustermap_cmap_c,
|
|
336
|
+
cmap_gpc=cfg.clustermap_cmap_gpc,
|
|
337
|
+
cmap_cpg=cfg.clustermap_cmap_cpg,
|
|
338
|
+
cmap_a=cfg.clustermap_cmap_a,
|
|
339
|
+
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
340
|
+
min_length=cfg.read_len_filter_thresholds[0],
|
|
341
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
342
|
+
0
|
|
343
|
+
],
|
|
344
|
+
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
345
|
+
demux_types=("double", "already"),
|
|
346
|
+
bins=None,
|
|
347
|
+
sample_mapping=None,
|
|
348
|
+
save_path=pp_clustermap_dir_dedup,
|
|
349
|
+
sort_by=cfg.spatial_clustermap_sortby,
|
|
350
|
+
deaminase=deaminase,
|
|
351
|
+
index_col_suffix=reindex_suffix,
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# UMAP / Leiden
|
|
355
|
+
if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
356
|
+
logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
|
|
185
357
|
else:
|
|
186
|
-
from ..tools import calculate_umap
|
|
187
358
|
make_dirs([pp_umap_dir])
|
|
188
359
|
|
|
189
360
|
var_filters = []
|
|
190
|
-
if smf_modality ==
|
|
361
|
+
if smf_modality == "direct":
|
|
191
362
|
for ref in references:
|
|
192
363
|
for base in cfg.mod_target_bases:
|
|
193
|
-
var_filters
|
|
364
|
+
var_filters.append(f"{ref}_{base}_site")
|
|
194
365
|
elif deaminase:
|
|
195
366
|
for ref in references:
|
|
196
|
-
var_filters
|
|
367
|
+
var_filters.append(f"{ref}_C_site")
|
|
197
368
|
else:
|
|
198
369
|
for ref in references:
|
|
199
370
|
for base in cfg.mod_target_bases:
|
|
200
|
-
var_filters
|
|
371
|
+
var_filters.append(f"{ref}_{base}_site")
|
|
201
372
|
|
|
202
|
-
adata = calculate_umap(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
373
|
+
adata = calculate_umap(
|
|
374
|
+
adata,
|
|
375
|
+
layer=cfg.layer_for_umap_plotting,
|
|
376
|
+
var_filters=var_filters,
|
|
377
|
+
n_pcs=10,
|
|
378
|
+
knn_neighbors=15,
|
|
379
|
+
)
|
|
207
380
|
|
|
208
|
-
## Clustering
|
|
209
381
|
sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
|
|
210
382
|
|
|
211
|
-
# Plotting UMAP
|
|
212
383
|
sc.settings.figdir = pp_umap_dir
|
|
213
|
-
umap_layers = [
|
|
384
|
+
umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
|
|
214
385
|
umap_layers += cfg.umap_layers_to_plot
|
|
215
386
|
sc.pl.umap(adata, color=umap_layers, show=False, save=True)
|
|
216
387
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
pp_autocorr_dir = pp_dir / "08_autocorrelations"
|
|
388
|
+
# ============================================================
|
|
389
|
+
# 3) Spatial autocorrelation + rolling metrics
|
|
390
|
+
# ============================================================
|
|
391
|
+
pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
|
|
223
392
|
|
|
224
|
-
if pp_autocorr_dir.is_dir():
|
|
225
|
-
|
|
393
|
+
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
394
|
+
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
226
395
|
else:
|
|
227
396
|
positions = adata.var_names.astype(int).values
|
|
228
397
|
lags = np.arange(cfg.autocorr_max_lag + 1)
|
|
229
398
|
|
|
230
|
-
# optional: try to parallelize autocorr per-row with joblib
|
|
231
399
|
try:
|
|
232
400
|
from joblib import Parallel, delayed
|
|
401
|
+
|
|
233
402
|
_have_joblib = True
|
|
234
403
|
except Exception:
|
|
235
404
|
_have_joblib = False
|
|
236
405
|
|
|
406
|
+
samples = (
|
|
407
|
+
adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
408
|
+
)
|
|
409
|
+
ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
|
|
410
|
+
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
411
|
+
|
|
237
412
|
for site_type in cfg.autocorr_site_types:
|
|
238
413
|
layer_key = f"{site_type}_site_binary"
|
|
239
414
|
if layer_key not in adata.layers:
|
|
240
|
-
|
|
415
|
+
logger.debug(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
|
|
241
416
|
continue
|
|
242
417
|
|
|
243
418
|
X = adata.layers[layer_key]
|
|
244
419
|
if getattr(X, "shape", (0,))[0] == 0:
|
|
245
|
-
|
|
420
|
+
logger.debug(f"Layer {layer_key} empty — skipping {site_type}.")
|
|
246
421
|
continue
|
|
247
422
|
|
|
248
|
-
# compute per-molecule autocorrs (and counts)
|
|
249
423
|
rows = []
|
|
250
424
|
counts = []
|
|
425
|
+
|
|
251
426
|
if _have_joblib:
|
|
252
|
-
|
|
427
|
+
|
|
253
428
|
def _worker(row):
|
|
254
429
|
try:
|
|
255
430
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
256
|
-
row,
|
|
431
|
+
row,
|
|
432
|
+
positions,
|
|
433
|
+
max_lag=cfg.autocorr_max_lag,
|
|
434
|
+
return_counts=True,
|
|
435
|
+
normalize=cfg.autocorr_normalization_method,
|
|
257
436
|
)
|
|
258
|
-
except Exception
|
|
259
|
-
# on error return NaN arrays
|
|
437
|
+
except Exception:
|
|
260
438
|
ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
|
|
261
439
|
cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
|
|
262
440
|
return ac, cnts
|
|
263
441
|
|
|
264
|
-
res = Parallel(n_jobs=
|
|
442
|
+
res = Parallel(n_jobs=getattr(cfg, "n_jobs", -1))(
|
|
265
443
|
delayed(_worker)(X[i]) for i in range(X.shape[0])
|
|
266
444
|
)
|
|
267
445
|
for ac, cnts in res:
|
|
268
446
|
rows.append(ac)
|
|
269
447
|
counts.append(cnts)
|
|
270
448
|
else:
|
|
271
|
-
# sequential fallback
|
|
272
449
|
for i in range(X.shape[0]):
|
|
273
450
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
274
|
-
X[i],
|
|
451
|
+
X[i],
|
|
452
|
+
positions,
|
|
453
|
+
max_lag=cfg.autocorr_max_lag,
|
|
454
|
+
return_counts=True,
|
|
455
|
+
normalize=cfg.autocorr_normalization_method,
|
|
275
456
|
)
|
|
276
457
|
rows.append(ac)
|
|
277
458
|
counts.append(cnts)
|
|
@@ -279,21 +460,23 @@ def spatial_adata(config_path):
|
|
|
279
460
|
autocorr_matrix = np.asarray(rows, dtype=np.float32)
|
|
280
461
|
counts_matrix = np.asarray(counts, dtype=np.int32)
|
|
281
462
|
|
|
282
|
-
# store raw per-molecule arrays (keep memory format compact)
|
|
283
463
|
adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
|
|
284
464
|
adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
|
|
285
465
|
adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
|
|
286
466
|
|
|
287
|
-
# compute global periodicity metrics across all molecules for this site_type
|
|
288
467
|
try:
|
|
289
468
|
results = analyze_autocorr_matrix(
|
|
290
|
-
autocorr_matrix,
|
|
291
|
-
|
|
469
|
+
autocorr_matrix,
|
|
470
|
+
counts_matrix,
|
|
471
|
+
lags,
|
|
472
|
+
nrl_search_bp=(120, 260),
|
|
473
|
+
pad_factor=4,
|
|
474
|
+
min_count=20,
|
|
475
|
+
max_harmonics=6,
|
|
292
476
|
)
|
|
293
477
|
except Exception as e:
|
|
294
478
|
results = {"error": str(e)}
|
|
295
479
|
|
|
296
|
-
# store global metrics (same keys you used)
|
|
297
480
|
global_metrics = {
|
|
298
481
|
"nrl_bp": results.get("nrl_bp", np.nan),
|
|
299
482
|
"xi": results.get("xi", np.nan),
|
|
@@ -305,13 +488,16 @@ def spatial_adata(config_path):
|
|
|
305
488
|
}
|
|
306
489
|
adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
|
|
307
490
|
|
|
308
|
-
# bootstrap for CI (use a reasonable default; set low only for debugging)
|
|
309
491
|
n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
|
|
310
|
-
# if user intentionally set very low n_boot in cfg, we keep that; otherwise default 200
|
|
311
492
|
try:
|
|
312
493
|
bs = bootstrap_periodicity(
|
|
313
|
-
autocorr_matrix,
|
|
314
|
-
|
|
494
|
+
autocorr_matrix,
|
|
495
|
+
counts_matrix,
|
|
496
|
+
lags,
|
|
497
|
+
n_boot=n_boot,
|
|
498
|
+
nrl_search_bp=(120, 260),
|
|
499
|
+
pad_factor=4,
|
|
500
|
+
min_count=20,
|
|
315
501
|
)
|
|
316
502
|
adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
|
|
317
503
|
"nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
|
|
@@ -320,57 +506,74 @@ def spatial_adata(config_path):
|
|
|
320
506
|
except Exception as e:
|
|
321
507
|
adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
|
|
322
508
|
|
|
323
|
-
# ----------------------------
|
|
324
|
-
# Compute group-level metrics for plotting (per sample × reference)
|
|
325
|
-
# ----------------------------
|
|
326
509
|
metrics_by_group = {}
|
|
327
510
|
sample_col = cfg.sample_name_col_for_plotting
|
|
328
|
-
ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
|
|
329
|
-
samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
|
|
330
|
-
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
331
511
|
|
|
332
|
-
# iterate groups and run analyzer on each group's subset; cache errors
|
|
333
512
|
for sample_name in samples:
|
|
334
|
-
sample_mask =
|
|
513
|
+
sample_mask = adata.obs[sample_col].values == sample_name
|
|
514
|
+
|
|
335
515
|
# combined group
|
|
336
516
|
mask = sample_mask
|
|
337
517
|
ac_sel = autocorr_matrix[mask, :]
|
|
338
518
|
cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
|
|
339
519
|
if ac_sel.size:
|
|
340
520
|
try:
|
|
341
|
-
r = analyze_autocorr_matrix(
|
|
342
|
-
|
|
521
|
+
r = analyze_autocorr_matrix(
|
|
522
|
+
ac_sel,
|
|
523
|
+
cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
|
|
524
|
+
lags,
|
|
525
|
+
nrl_search_bp=(120, 260),
|
|
526
|
+
pad_factor=4,
|
|
527
|
+
min_count=10,
|
|
528
|
+
max_harmonics=6,
|
|
529
|
+
)
|
|
343
530
|
except Exception as e:
|
|
344
531
|
r = {"error": str(e)}
|
|
345
532
|
else:
|
|
346
533
|
r = {"error": "no_data"}
|
|
347
534
|
metrics_by_group[(sample_name, None)] = r
|
|
348
535
|
|
|
349
|
-
# per-reference groups
|
|
350
536
|
for ref in refs:
|
|
351
537
|
mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
|
|
352
538
|
ac_sel = autocorr_matrix[mask_ref, :]
|
|
353
539
|
cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
|
|
354
540
|
if ac_sel.size:
|
|
355
541
|
try:
|
|
356
|
-
r = analyze_autocorr_matrix(
|
|
357
|
-
|
|
542
|
+
r = analyze_autocorr_matrix(
|
|
543
|
+
ac_sel,
|
|
544
|
+
cnt_sel
|
|
545
|
+
if cnt_sel is not None
|
|
546
|
+
else np.zeros_like(ac_sel, dtype=int),
|
|
547
|
+
lags,
|
|
548
|
+
nrl_search_bp=(120, 260),
|
|
549
|
+
pad_factor=4,
|
|
550
|
+
min_count=10,
|
|
551
|
+
max_harmonics=6,
|
|
552
|
+
)
|
|
358
553
|
except Exception as e:
|
|
359
554
|
r = {"error": str(e)}
|
|
360
555
|
else:
|
|
361
556
|
r = {"error": "no_data"}
|
|
362
557
|
metrics_by_group[(sample_name, ref)] = r
|
|
363
558
|
|
|
364
|
-
# persist group metrics
|
|
365
559
|
adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
|
|
366
560
|
|
|
367
|
-
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
561
|
+
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
562
|
+
"nrl_bp", None
|
|
563
|
+
)
|
|
368
564
|
|
|
369
|
-
# configuration / sensible defaults (override in cfg if present)
|
|
370
565
|
rolling_cfg = {
|
|
371
|
-
"window_size": getattr(
|
|
566
|
+
"window_size": getattr(
|
|
567
|
+
cfg,
|
|
568
|
+
"rolling_window_size",
|
|
569
|
+
getattr(cfg, "autocorr_rolling_window_size", 600),
|
|
570
|
+
),
|
|
372
571
|
"step": getattr(cfg, "rolling_step", 100),
|
|
373
|
-
"max_lag": getattr(
|
|
572
|
+
"max_lag": getattr(
|
|
573
|
+
cfg,
|
|
574
|
+
"rolling_max_lag",
|
|
575
|
+
getattr(cfg, "autocorr_max_lag", 500),
|
|
576
|
+
),
|
|
374
577
|
"min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
|
|
375
578
|
"nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
|
|
376
579
|
"pad_factor": getattr(cfg, "rolling_pad_factor", 4),
|
|
@@ -381,23 +584,19 @@ def spatial_adata(config_path):
|
|
|
381
584
|
|
|
382
585
|
write_plots = getattr(cfg, "rolling_write_plots", True)
|
|
383
586
|
write_csvs = getattr(cfg, "rolling_write_csvs", True)
|
|
384
|
-
min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
|
|
587
|
+
min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)
|
|
385
588
|
|
|
386
589
|
rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
|
|
387
590
|
os.makedirs(rolling_out_dir, exist_ok=True)
|
|
388
|
-
# also a per-site subfolder
|
|
389
591
|
site_out_dir = os.path.join(rolling_out_dir, site_type)
|
|
390
592
|
os.makedirs(site_out_dir, exist_ok=True)
|
|
391
593
|
|
|
392
|
-
combined_rows = []
|
|
393
|
-
rolling_results_by_group = {}
|
|
594
|
+
combined_rows = []
|
|
595
|
+
rolling_results_by_group = {}
|
|
394
596
|
|
|
395
|
-
# iterate groups (samples × refs). `samples` and `refs` were computed above.
|
|
396
597
|
for sample_name in samples:
|
|
397
|
-
sample_mask =
|
|
398
|
-
# first the combined group ("all refs")
|
|
598
|
+
sample_mask = adata.obs[sample_col].values == sample_name
|
|
399
599
|
group_masks = [("all", sample_mask)]
|
|
400
|
-
# then per-reference groups
|
|
401
600
|
for ref in refs:
|
|
402
601
|
ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
|
|
403
602
|
group_masks.append((ref, ref_mask))
|
|
@@ -405,17 +604,10 @@ def spatial_adata(config_path):
|
|
|
405
604
|
for ref_label, mask in group_masks:
|
|
406
605
|
n_group = int(mask.sum())
|
|
407
606
|
if n_group < min_molecules_for_group:
|
|
408
|
-
# skip tiny groups
|
|
409
|
-
if cfg.get("verbosity", 0) if hasattr(cfg, "get") else False:
|
|
410
|
-
print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
|
|
411
|
-
# still write an empty CSV row set if desired; here we skip
|
|
412
607
|
continue
|
|
413
608
|
|
|
414
|
-
# extract group matrix X_group (works with dense or sparse adata.layers)
|
|
415
609
|
X_group = X[mask, :]
|
|
416
|
-
# positions already set above
|
|
417
610
|
try:
|
|
418
|
-
# call your rolling function (this may be slow; it uses cfg.n_jobs)
|
|
419
611
|
df_roll = rolling_autocorr_metrics(
|
|
420
612
|
X_group,
|
|
421
613
|
positions,
|
|
@@ -430,135 +622,160 @@ def spatial_adata(config_path):
|
|
|
430
622
|
max_harmonics=rolling_cfg["max_harmonics"],
|
|
431
623
|
n_jobs=rolling_cfg["n_jobs"],
|
|
432
624
|
verbose=False,
|
|
433
|
-
fixed_nrl_bp=global_nrl
|
|
625
|
+
fixed_nrl_bp=global_nrl,
|
|
434
626
|
)
|
|
435
627
|
except Exception as e:
|
|
436
|
-
|
|
628
|
+
logger.warning(
|
|
629
|
+
f"rolling_autocorr_metrics failed for {site_type} "
|
|
630
|
+
f"{sample_name} {ref_label}: {e}"
|
|
631
|
+
)
|
|
437
632
|
continue
|
|
438
633
|
|
|
439
|
-
# normalize column names and keep only the compact set you want
|
|
440
|
-
# keep: center, n_molecules, nrl_bp, snr, xi, fwhm_bp
|
|
441
634
|
if "center" not in df_roll.columns:
|
|
442
|
-
|
|
443
|
-
|
|
635
|
+
logger.warning(
|
|
636
|
+
f"rolling_autocorr_metrics returned unexpected schema "
|
|
637
|
+
f"for {site_type} {sample_name} {ref_label}"
|
|
638
|
+
)
|
|
444
639
|
continue
|
|
445
640
|
|
|
446
|
-
compact_df = df_roll[
|
|
641
|
+
compact_df = df_roll[
|
|
642
|
+
["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]
|
|
643
|
+
].copy()
|
|
447
644
|
compact_df["site"] = site_type
|
|
448
645
|
compact_df["sample"] = sample_name
|
|
449
646
|
compact_df["reference"] = ref_label if ref_label != "all" else "all"
|
|
450
647
|
|
|
451
|
-
# save per-group CSV
|
|
452
648
|
if write_csvs:
|
|
453
649
|
safe_sample = str(sample_name).replace(os.sep, "_")
|
|
454
|
-
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
455
|
-
|
|
650
|
+
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
651
|
+
os.sep, "_"
|
|
652
|
+
)
|
|
653
|
+
out_csv = os.path.join(
|
|
654
|
+
site_out_dir,
|
|
655
|
+
f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
|
|
656
|
+
)
|
|
456
657
|
try:
|
|
457
658
|
compact_df.to_csv(out_csv, index=False)
|
|
458
659
|
except Exception as e:
|
|
459
|
-
|
|
660
|
+
logger.warning(f"Failed to write rolling CSV {out_csv}: {e}")
|
|
460
661
|
|
|
461
|
-
# save a plot per-group (NRL and SNR vs center)
|
|
462
662
|
if write_plots:
|
|
463
663
|
try:
|
|
464
|
-
# use your plot helper; if it's in a different module, import accordingly
|
|
465
664
|
from ..plotting import plot_rolling_metrics as _plot_roll
|
|
466
665
|
except Exception:
|
|
467
|
-
_plot_roll =
|
|
666
|
+
_plot_roll = None
|
|
468
667
|
if _plot_roll is not None:
|
|
469
|
-
plot_png = os.path.join(
|
|
668
|
+
plot_png = os.path.join(
|
|
669
|
+
site_out_dir,
|
|
670
|
+
f"{safe_sample}__{safe_ref}__rolling_metrics.png",
|
|
671
|
+
)
|
|
470
672
|
try:
|
|
471
|
-
_plot_roll(
|
|
472
|
-
|
|
473
|
-
|
|
673
|
+
_plot_roll(
|
|
674
|
+
compact_df,
|
|
675
|
+
out_png=plot_png,
|
|
676
|
+
title=f"{site_type} {sample_name} {ref_label}",
|
|
677
|
+
figsize=(10, 3.5),
|
|
678
|
+
dpi=160,
|
|
679
|
+
show=False,
|
|
680
|
+
)
|
|
474
681
|
except Exception as e:
|
|
475
|
-
|
|
682
|
+
logger.warning(
|
|
683
|
+
f"Failed to create rolling plot for {site_type} "
|
|
684
|
+
f"{sample_name} {ref_label}: {e}"
|
|
685
|
+
)
|
|
476
686
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
687
|
+
combined_rows.append(
|
|
688
|
+
compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
|
|
689
|
+
)
|
|
690
|
+
rolling_results_by_group[
|
|
691
|
+
(sample_name, None if ref_label == "all" else ref_label)
|
|
692
|
+
] = compact_df
|
|
480
693
|
|
|
481
|
-
# persist per-site rolling metrics into adata.uns as dict of DataFrames (or empty dict)
|
|
482
694
|
adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
|
|
483
695
|
|
|
484
|
-
|
|
485
|
-
if len(combined_rows):
|
|
696
|
+
if combined_rows:
|
|
486
697
|
combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
|
|
487
|
-
combined_out_csv = os.path.join(
|
|
698
|
+
combined_out_csv = os.path.join(
|
|
699
|
+
rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv"
|
|
700
|
+
)
|
|
488
701
|
try:
|
|
489
702
|
combined_df_site.to_csv(combined_out_csv, index=False)
|
|
490
703
|
except Exception as e:
|
|
491
|
-
|
|
704
|
+
logger.warning(f"Failed to write combined rolling CSV for {site_type}: {e}")
|
|
492
705
|
|
|
493
706
|
rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
|
|
494
707
|
plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
|
|
495
708
|
os.makedirs(plot_out_dir, exist_ok=True)
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
plot_spatial_autocorr_grid(adata,
|
|
507
|
-
pp_autocorr_dir,
|
|
508
|
-
site_types=cfg.autocorr_site_types,
|
|
509
|
-
sample_col=cfg.sample_name_col_for_plotting,
|
|
510
|
-
window=cfg.autocorr_rolling_window_size,
|
|
511
|
-
rows_per_fig=cfg.rows_per_qc_autocorr_grid)
|
|
512
|
-
|
|
513
|
-
############ Pearson analyses ###############
|
|
514
|
-
if smf_modality != 'direct':
|
|
515
|
-
from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
|
|
516
|
-
|
|
517
|
-
pp_corr_dir = pp_dir / "09_correlation_matrices"
|
|
518
|
-
|
|
519
|
-
if pp_corr_dir.is_dir():
|
|
520
|
-
print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
|
|
521
|
-
else:
|
|
522
|
-
compute_positionwise_statistics(
|
|
523
|
-
adata,
|
|
524
|
-
layer="nan0_0minus1",
|
|
525
|
-
methods=cfg.correlation_matrix_types,
|
|
526
|
-
sample_col=cfg.sample_name_col_for_plotting,
|
|
527
|
-
ref_col=cfg.reference_column,
|
|
528
|
-
output_key="positionwise_result",
|
|
529
|
-
site_types=cfg.correlation_matrix_site_types,
|
|
530
|
-
encoding="signed",
|
|
531
|
-
max_threads=cfg.threads,
|
|
532
|
-
min_count_for_pairwise=10,
|
|
709
|
+
_ = plot_rolling_grid(
|
|
710
|
+
rolling_dict,
|
|
711
|
+
plot_out_dir,
|
|
712
|
+
site_type,
|
|
713
|
+
rows_per_page=cfg.rows_per_qc_autocorr_grid,
|
|
714
|
+
cols_per_page=len(refs),
|
|
715
|
+
dpi=160,
|
|
716
|
+
metrics=("nrl_bp", "snr", "xi"),
|
|
717
|
+
per_metric_ylim={"snr": (0, 25)},
|
|
533
718
|
)
|
|
534
|
-
|
|
535
|
-
|
|
719
|
+
|
|
720
|
+
make_dirs([pp_autocorr_dir])
|
|
721
|
+
plot_spatial_autocorr_grid(
|
|
536
722
|
adata,
|
|
537
|
-
|
|
723
|
+
pp_autocorr_dir,
|
|
724
|
+
site_types=cfg.autocorr_site_types,
|
|
538
725
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
cmaps=cfg.correlation_matrix_cmaps,
|
|
543
|
-
vmin=None,
|
|
544
|
-
vmax=None,
|
|
545
|
-
output_dir=pp_corr_dir,
|
|
546
|
-
output_key= "positionwise_result"
|
|
726
|
+
window=cfg.autocorr_rolling_window_size,
|
|
727
|
+
rows_per_fig=cfg.rows_per_qc_autocorr_grid,
|
|
728
|
+
normalization_method=cfg.autocorr_normalization_method,
|
|
547
729
|
)
|
|
548
730
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
if ".gz" == spatial_adata_path.suffix:
|
|
554
|
-
print(f"Spatial adata path: {spatial_adata_path}")
|
|
555
|
-
safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
|
|
556
|
-
else:
|
|
557
|
-
spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
|
|
558
|
-
print(f"Spatial adata path: {spatial_adata_path}")
|
|
559
|
-
safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
|
|
560
|
-
############################################### smftools spatial end ###############################################
|
|
561
|
-
|
|
562
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
|
|
731
|
+
# ============================================================
|
|
732
|
+
# 4) Pearson / correlation matrices
|
|
733
|
+
# ============================================================
|
|
734
|
+
pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
|
|
563
735
|
|
|
564
|
-
|
|
736
|
+
if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
737
|
+
logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
738
|
+
else:
|
|
739
|
+
compute_positionwise_statistics(
|
|
740
|
+
adata,
|
|
741
|
+
layer="nan0_0minus1",
|
|
742
|
+
methods=cfg.correlation_matrix_types,
|
|
743
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
744
|
+
ref_col=cfg.reference_column,
|
|
745
|
+
output_key="positionwise_result",
|
|
746
|
+
site_types=cfg.correlation_matrix_site_types,
|
|
747
|
+
encoding="signed",
|
|
748
|
+
max_threads=cfg.threads,
|
|
749
|
+
min_count_for_pairwise=10,
|
|
750
|
+
)
|
|
751
|
+
|
|
752
|
+
plot_positionwise_matrices(
|
|
753
|
+
adata,
|
|
754
|
+
methods=cfg.correlation_matrix_types,
|
|
755
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
756
|
+
ref_col=cfg.reference_column,
|
|
757
|
+
figsize_per_cell=(4.0, 3.0),
|
|
758
|
+
dpi=160,
|
|
759
|
+
cmaps=cfg.correlation_matrix_cmaps,
|
|
760
|
+
vmin=None,
|
|
761
|
+
vmax=None,
|
|
762
|
+
output_dir=pp_corr_dir,
|
|
763
|
+
output_key="positionwise_result",
|
|
764
|
+
)
|
|
765
|
+
|
|
766
|
+
# ============================================================
|
|
767
|
+
# 5) Save spatial AnnData
|
|
768
|
+
# ============================================================
|
|
769
|
+
if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
|
|
770
|
+
logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
|
|
771
|
+
record_smftools_metadata(
|
|
772
|
+
adata,
|
|
773
|
+
step_name="spatial",
|
|
774
|
+
cfg=cfg,
|
|
775
|
+
config_path=config_path,
|
|
776
|
+
input_paths=[source_adata_path] if source_adata_path else None,
|
|
777
|
+
output_path=spatial_adata_path,
|
|
778
|
+
)
|
|
779
|
+
write_gz_h5ad(adata, spatial_adata_path)
|
|
780
|
+
|
|
781
|
+
return adata, spatial_adata_path
|