smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -3,6 +3,11 @@ from typing import Optional, Tuple
|
|
|
3
3
|
|
|
4
4
|
import anndata as ad
|
|
5
5
|
|
|
6
|
+
from smftools.logging_utils import get_logger
|
|
7
|
+
|
|
8
|
+
logger = get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
6
11
|
def spatial_adata(
|
|
7
12
|
config_path: str,
|
|
8
13
|
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
|
|
@@ -26,10 +31,10 @@ def spatial_adata(
|
|
|
26
31
|
spatial_adata_path : Path | None
|
|
27
32
|
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
28
33
|
"""
|
|
29
|
-
from ..readwrite import
|
|
34
|
+
from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
|
|
35
|
+
from .helpers import get_adata_paths
|
|
30
36
|
from .load_adata import load_adata
|
|
31
37
|
from .preprocess_adata import preprocess_adata
|
|
32
|
-
from .helpers import get_adata_paths
|
|
33
38
|
|
|
34
39
|
# 1) Ensure config + basic paths via load_adata
|
|
35
40
|
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
@@ -45,21 +50,22 @@ def spatial_adata(
|
|
|
45
50
|
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
46
51
|
# If HMM exists, it's the most processed stage — reuse it.
|
|
47
52
|
if hmm_path.exists():
|
|
48
|
-
|
|
53
|
+
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
49
54
|
return None, hmm_path
|
|
50
55
|
|
|
51
56
|
# If spatial exists, we consider spatial analyses already done.
|
|
52
57
|
if spatial_path.exists():
|
|
53
|
-
|
|
58
|
+
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
54
59
|
return None, spatial_path
|
|
55
60
|
|
|
56
61
|
# 2) Ensure preprocessing has been run
|
|
57
62
|
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
58
|
-
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
63
|
+
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
64
|
+
config_path
|
|
65
|
+
)
|
|
59
66
|
|
|
60
67
|
# Helper to load from disk, reusing loaded_adata if it matches
|
|
61
68
|
def _load(path: Path):
|
|
62
|
-
from ..readwrite import safe_read_h5ad
|
|
63
69
|
if loaded_adata is not None and loaded_path == path:
|
|
64
70
|
return loaded_adata
|
|
65
71
|
adata, _ = safe_read_h5ad(path)
|
|
@@ -69,15 +75,19 @@ def spatial_adata(
|
|
|
69
75
|
# Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
|
|
70
76
|
if pp_dedup_adata is not None:
|
|
71
77
|
start_adata = pp_dedup_adata
|
|
78
|
+
source_path = pp_dedup_adata_path_ret
|
|
72
79
|
else:
|
|
73
80
|
if pp_dedup_path.exists():
|
|
74
81
|
start_adata = _load(pp_dedup_path)
|
|
82
|
+
source_path = pp_dedup_path
|
|
75
83
|
elif pp_path.exists():
|
|
76
84
|
start_adata = _load(pp_path)
|
|
85
|
+
source_path = pp_path
|
|
77
86
|
elif raw_path.exists():
|
|
78
87
|
start_adata = _load(raw_path)
|
|
88
|
+
source_path = raw_path
|
|
79
89
|
else:
|
|
80
|
-
|
|
90
|
+
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
81
91
|
return None, None
|
|
82
92
|
|
|
83
93
|
# 4) Run the spatial core
|
|
@@ -88,6 +98,8 @@ def spatial_adata(
|
|
|
88
98
|
pp_adata_path=pp_path,
|
|
89
99
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
90
100
|
pp_adata_in_memory=pp_adata,
|
|
101
|
+
source_adata_path=source_path,
|
|
102
|
+
config_path=config_path,
|
|
91
103
|
)
|
|
92
104
|
|
|
93
105
|
# 5) Register spatial path in summary CSV
|
|
@@ -103,6 +115,8 @@ def spatial_adata_core(
|
|
|
103
115
|
pp_adata_path: Path,
|
|
104
116
|
pp_dup_rem_adata_path: Path,
|
|
105
117
|
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
118
|
+
source_adata_path: Optional[Path] = None,
|
|
119
|
+
config_path: Optional[str] = None,
|
|
106
120
|
) -> Tuple[ad.AnnData, Path]:
|
|
107
121
|
"""
|
|
108
122
|
Core spatial analysis pipeline.
|
|
@@ -141,30 +155,30 @@ def spatial_adata_core(
|
|
|
141
155
|
import pandas as pd
|
|
142
156
|
import scanpy as sc
|
|
143
157
|
|
|
144
|
-
from ..
|
|
145
|
-
from .helpers import write_gz_h5ad
|
|
146
|
-
|
|
147
|
-
from ..preprocessing import (
|
|
148
|
-
load_sample_sheet,
|
|
149
|
-
invert_adata,
|
|
150
|
-
reindex_references_adata,
|
|
151
|
-
)
|
|
158
|
+
from ..metadata import record_smftools_metadata
|
|
152
159
|
from ..plotting import (
|
|
153
160
|
combined_raw_clustermap,
|
|
154
161
|
plot_rolling_grid,
|
|
155
162
|
plot_spatial_autocorr_grid,
|
|
156
163
|
)
|
|
164
|
+
from ..preprocessing import (
|
|
165
|
+
invert_adata,
|
|
166
|
+
load_sample_sheet,
|
|
167
|
+
reindex_references_adata,
|
|
168
|
+
)
|
|
169
|
+
from ..readwrite import make_dirs, safe_read_h5ad
|
|
157
170
|
from ..tools import calculate_umap
|
|
171
|
+
from ..tools.position_stats import (
|
|
172
|
+
compute_positionwise_statistics,
|
|
173
|
+
plot_positionwise_matrices,
|
|
174
|
+
)
|
|
158
175
|
from ..tools.spatial_autocorrelation import (
|
|
159
|
-
binary_autocorrelation_with_spacing,
|
|
160
176
|
analyze_autocorr_matrix,
|
|
177
|
+
binary_autocorrelation_with_spacing,
|
|
161
178
|
bootstrap_periodicity,
|
|
162
179
|
rolling_autocorr_metrics,
|
|
163
180
|
)
|
|
164
|
-
from
|
|
165
|
-
compute_positionwise_statistics,
|
|
166
|
-
plot_positionwise_matrices,
|
|
167
|
-
)
|
|
181
|
+
from .helpers import write_gz_h5ad
|
|
168
182
|
|
|
169
183
|
# -----------------------------
|
|
170
184
|
# General setup
|
|
@@ -207,7 +221,12 @@ def spatial_adata_core(
|
|
|
207
221
|
offsets=cfg.reindexing_offsets,
|
|
208
222
|
new_col=cfg.reindexed_var_suffix,
|
|
209
223
|
)
|
|
210
|
-
|
|
224
|
+
|
|
225
|
+
if adata.uns.get("reindex_references_adata_performed", False):
|
|
226
|
+
reindex_suffix = cfg.reindexed_var_suffix
|
|
227
|
+
else:
|
|
228
|
+
reindex_suffix = None
|
|
229
|
+
|
|
211
230
|
pp_dir = output_directory / "preprocessed"
|
|
212
231
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
213
232
|
|
|
@@ -223,7 +242,9 @@ def spatial_adata_core(
|
|
|
223
242
|
if pp_clustermap_dir.is_dir() and not getattr(
|
|
224
243
|
cfg, "force_redo_spatial_analyses", False
|
|
225
244
|
):
|
|
226
|
-
|
|
245
|
+
logger.debug(
|
|
246
|
+
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
247
|
+
)
|
|
227
248
|
else:
|
|
228
249
|
make_dirs([pp_dir, pp_clustermap_dir])
|
|
229
250
|
|
|
@@ -232,6 +253,34 @@ def spatial_adata_core(
|
|
|
232
253
|
else:
|
|
233
254
|
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
234
255
|
|
|
256
|
+
# -----------------------------
|
|
257
|
+
# Optional sample sheet metadata
|
|
258
|
+
# -----------------------------
|
|
259
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
260
|
+
load_sample_sheet(
|
|
261
|
+
pp_adata,
|
|
262
|
+
cfg.sample_sheet_path,
|
|
263
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
264
|
+
as_category=True,
|
|
265
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
# -----------------------------
|
|
269
|
+
# Optional inversion along positions axis
|
|
270
|
+
# -----------------------------
|
|
271
|
+
if getattr(cfg, "invert_adata", False):
|
|
272
|
+
pp_adata = invert_adata(pp_adata)
|
|
273
|
+
|
|
274
|
+
# -----------------------------
|
|
275
|
+
# Optional reindexing by reference
|
|
276
|
+
# -----------------------------
|
|
277
|
+
reindex_references_adata(
|
|
278
|
+
pp_adata,
|
|
279
|
+
reference_col=cfg.reference_column,
|
|
280
|
+
offsets=cfg.reindexing_offsets,
|
|
281
|
+
new_col=cfg.reindexed_var_suffix,
|
|
282
|
+
)
|
|
283
|
+
|
|
235
284
|
combined_raw_clustermap(
|
|
236
285
|
pp_adata,
|
|
237
286
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
@@ -247,16 +296,19 @@ def spatial_adata_core(
|
|
|
247
296
|
cmap_a=cfg.clustermap_cmap_a,
|
|
248
297
|
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
249
298
|
min_length=cfg.read_len_filter_thresholds[0],
|
|
250
|
-
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
299
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
300
|
+
0
|
|
301
|
+
],
|
|
251
302
|
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
303
|
+
demux_types=("double", "already"),
|
|
252
304
|
bins=None,
|
|
253
305
|
sample_mapping=None,
|
|
254
306
|
save_path=pp_clustermap_dir,
|
|
255
307
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
256
308
|
deaminase=deaminase,
|
|
257
|
-
index_col_suffix=
|
|
309
|
+
index_col_suffix=reindex_suffix,
|
|
258
310
|
)
|
|
259
|
-
|
|
311
|
+
|
|
260
312
|
# ============================================================
|
|
261
313
|
# 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
|
|
262
314
|
# ============================================================
|
|
@@ -265,10 +317,10 @@ def spatial_adata_core(
|
|
|
265
317
|
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
266
318
|
|
|
267
319
|
# Clustermaps on deduplicated adata
|
|
268
|
-
if pp_clustermap_dir_dedup.is_dir() and not getattr(
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
320
|
+
if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
321
|
+
logger.debug(
|
|
322
|
+
f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
323
|
+
)
|
|
272
324
|
else:
|
|
273
325
|
make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
|
|
274
326
|
combined_raw_clustermap(
|
|
@@ -286,19 +338,22 @@ def spatial_adata_core(
|
|
|
286
338
|
cmap_a=cfg.clustermap_cmap_a,
|
|
287
339
|
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
288
340
|
min_length=cfg.read_len_filter_thresholds[0],
|
|
289
|
-
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
341
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
342
|
+
0
|
|
343
|
+
],
|
|
290
344
|
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
345
|
+
demux_types=("double", "already"),
|
|
291
346
|
bins=None,
|
|
292
347
|
sample_mapping=None,
|
|
293
348
|
save_path=pp_clustermap_dir_dedup,
|
|
294
349
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
295
350
|
deaminase=deaminase,
|
|
296
|
-
index_col_suffix=
|
|
351
|
+
index_col_suffix=reindex_suffix,
|
|
297
352
|
)
|
|
298
353
|
|
|
299
354
|
# UMAP / Leiden
|
|
300
355
|
if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
301
|
-
|
|
356
|
+
logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
|
|
302
357
|
else:
|
|
303
358
|
make_dirs([pp_umap_dir])
|
|
304
359
|
|
|
@@ -336,40 +391,48 @@ def spatial_adata_core(
|
|
|
336
391
|
pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
|
|
337
392
|
|
|
338
393
|
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
339
|
-
|
|
394
|
+
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
340
395
|
else:
|
|
341
396
|
positions = adata.var_names.astype(int).values
|
|
342
397
|
lags = np.arange(cfg.autocorr_max_lag + 1)
|
|
343
398
|
|
|
344
399
|
try:
|
|
345
400
|
from joblib import Parallel, delayed
|
|
401
|
+
|
|
346
402
|
_have_joblib = True
|
|
347
403
|
except Exception:
|
|
348
404
|
_have_joblib = False
|
|
349
405
|
|
|
350
|
-
samples =
|
|
406
|
+
samples = (
|
|
407
|
+
adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
408
|
+
)
|
|
351
409
|
ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
|
|
352
410
|
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
353
411
|
|
|
354
412
|
for site_type in cfg.autocorr_site_types:
|
|
355
413
|
layer_key = f"{site_type}_site_binary"
|
|
356
414
|
if layer_key not in adata.layers:
|
|
357
|
-
|
|
415
|
+
logger.debug(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
|
|
358
416
|
continue
|
|
359
417
|
|
|
360
418
|
X = adata.layers[layer_key]
|
|
361
419
|
if getattr(X, "shape", (0,))[0] == 0:
|
|
362
|
-
|
|
420
|
+
logger.debug(f"Layer {layer_key} empty — skipping {site_type}.")
|
|
363
421
|
continue
|
|
364
422
|
|
|
365
423
|
rows = []
|
|
366
424
|
counts = []
|
|
367
425
|
|
|
368
426
|
if _have_joblib:
|
|
427
|
+
|
|
369
428
|
def _worker(row):
|
|
370
429
|
try:
|
|
371
430
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
372
|
-
row,
|
|
431
|
+
row,
|
|
432
|
+
positions,
|
|
433
|
+
max_lag=cfg.autocorr_max_lag,
|
|
434
|
+
return_counts=True,
|
|
435
|
+
normalize=cfg.autocorr_normalization_method,
|
|
373
436
|
)
|
|
374
437
|
except Exception:
|
|
375
438
|
ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
|
|
@@ -385,7 +448,11 @@ def spatial_adata_core(
|
|
|
385
448
|
else:
|
|
386
449
|
for i in range(X.shape[0]):
|
|
387
450
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
388
|
-
X[i],
|
|
451
|
+
X[i],
|
|
452
|
+
positions,
|
|
453
|
+
max_lag=cfg.autocorr_max_lag,
|
|
454
|
+
return_counts=True,
|
|
455
|
+
normalize=cfg.autocorr_normalization_method,
|
|
389
456
|
)
|
|
390
457
|
rows.append(ac)
|
|
391
458
|
counts.append(cnts)
|
|
@@ -474,7 +541,9 @@ def spatial_adata_core(
|
|
|
474
541
|
try:
|
|
475
542
|
r = analyze_autocorr_matrix(
|
|
476
543
|
ac_sel,
|
|
477
|
-
cnt_sel
|
|
544
|
+
cnt_sel
|
|
545
|
+
if cnt_sel is not None
|
|
546
|
+
else np.zeros_like(ac_sel, dtype=int),
|
|
478
547
|
lags,
|
|
479
548
|
nrl_search_bp=(120, 260),
|
|
480
549
|
pad_factor=4,
|
|
@@ -489,7 +558,9 @@ def spatial_adata_core(
|
|
|
489
558
|
|
|
490
559
|
adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
|
|
491
560
|
|
|
492
|
-
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
561
|
+
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
562
|
+
"nrl_bp", None
|
|
563
|
+
)
|
|
493
564
|
|
|
494
565
|
rolling_cfg = {
|
|
495
566
|
"window_size": getattr(
|
|
@@ -554,27 +625,31 @@ def spatial_adata_core(
|
|
|
554
625
|
fixed_nrl_bp=global_nrl,
|
|
555
626
|
)
|
|
556
627
|
except Exception as e:
|
|
557
|
-
|
|
628
|
+
logger.warning(
|
|
558
629
|
f"rolling_autocorr_metrics failed for {site_type} "
|
|
559
630
|
f"{sample_name} {ref_label}: {e}"
|
|
560
631
|
)
|
|
561
632
|
continue
|
|
562
633
|
|
|
563
634
|
if "center" not in df_roll.columns:
|
|
564
|
-
|
|
635
|
+
logger.warning(
|
|
565
636
|
f"rolling_autocorr_metrics returned unexpected schema "
|
|
566
637
|
f"for {site_type} {sample_name} {ref_label}"
|
|
567
638
|
)
|
|
568
639
|
continue
|
|
569
640
|
|
|
570
|
-
compact_df = df_roll[
|
|
641
|
+
compact_df = df_roll[
|
|
642
|
+
["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]
|
|
643
|
+
].copy()
|
|
571
644
|
compact_df["site"] = site_type
|
|
572
645
|
compact_df["sample"] = sample_name
|
|
573
646
|
compact_df["reference"] = ref_label if ref_label != "all" else "all"
|
|
574
647
|
|
|
575
648
|
if write_csvs:
|
|
576
649
|
safe_sample = str(sample_name).replace(os.sep, "_")
|
|
577
|
-
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
650
|
+
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
651
|
+
os.sep, "_"
|
|
652
|
+
)
|
|
578
653
|
out_csv = os.path.join(
|
|
579
654
|
site_out_dir,
|
|
580
655
|
f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
|
|
@@ -582,7 +657,7 @@ def spatial_adata_core(
|
|
|
582
657
|
try:
|
|
583
658
|
compact_df.to_csv(out_csv, index=False)
|
|
584
659
|
except Exception as e:
|
|
585
|
-
|
|
660
|
+
logger.warning(f"Failed to write rolling CSV {out_csv}: {e}")
|
|
586
661
|
|
|
587
662
|
if write_plots:
|
|
588
663
|
try:
|
|
@@ -604,7 +679,7 @@ def spatial_adata_core(
|
|
|
604
679
|
show=False,
|
|
605
680
|
)
|
|
606
681
|
except Exception as e:
|
|
607
|
-
|
|
682
|
+
logger.warning(
|
|
608
683
|
f"Failed to create rolling plot for {site_type} "
|
|
609
684
|
f"{sample_name} {ref_label}: {e}"
|
|
610
685
|
)
|
|
@@ -612,7 +687,9 @@ def spatial_adata_core(
|
|
|
612
687
|
combined_rows.append(
|
|
613
688
|
compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
|
|
614
689
|
)
|
|
615
|
-
rolling_results_by_group[
|
|
690
|
+
rolling_results_by_group[
|
|
691
|
+
(sample_name, None if ref_label == "all" else ref_label)
|
|
692
|
+
] = compact_df
|
|
616
693
|
|
|
617
694
|
adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
|
|
618
695
|
|
|
@@ -624,9 +701,7 @@ def spatial_adata_core(
|
|
|
624
701
|
try:
|
|
625
702
|
combined_df_site.to_csv(combined_out_csv, index=False)
|
|
626
703
|
except Exception as e:
|
|
627
|
-
|
|
628
|
-
f"Failed to write combined rolling CSV for {site_type}: {e}"
|
|
629
|
-
)
|
|
704
|
+
logger.warning(f"Failed to write combined rolling CSV for {site_type}: {e}")
|
|
630
705
|
|
|
631
706
|
rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
|
|
632
707
|
plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
|
|
@@ -650,6 +725,7 @@ def spatial_adata_core(
|
|
|
650
725
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
651
726
|
window=cfg.autocorr_rolling_window_size,
|
|
652
727
|
rows_per_fig=cfg.rows_per_qc_autocorr_grid,
|
|
728
|
+
normalization_method=cfg.autocorr_normalization_method,
|
|
653
729
|
)
|
|
654
730
|
|
|
655
731
|
# ============================================================
|
|
@@ -658,7 +734,7 @@ def spatial_adata_core(
|
|
|
658
734
|
pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
|
|
659
735
|
|
|
660
736
|
if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
661
|
-
|
|
737
|
+
logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
662
738
|
else:
|
|
663
739
|
compute_positionwise_statistics(
|
|
664
740
|
adata,
|
|
@@ -691,7 +767,15 @@ def spatial_adata_core(
|
|
|
691
767
|
# 5) Save spatial AnnData
|
|
692
768
|
# ============================================================
|
|
693
769
|
if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
|
|
694
|
-
|
|
770
|
+
logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
|
|
771
|
+
record_smftools_metadata(
|
|
772
|
+
adata,
|
|
773
|
+
step_name="spatial",
|
|
774
|
+
cfg=cfg,
|
|
775
|
+
config_path=config_path,
|
|
776
|
+
input_paths=[source_adata_path] if source_adata_path else None,
|
|
777
|
+
output_path=spatial_adata_path,
|
|
778
|
+
)
|
|
695
779
|
write_gz_h5ad(adata, spatial_adata_path)
|
|
696
780
|
|
|
697
|
-
return adata, spatial_adata_path
|
|
781
|
+
return adata, spatial_adata_path
|