smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
2
4
|
from typing import Optional, Tuple
|
|
3
5
|
|
|
4
6
|
import anndata as ad
|
|
5
7
|
|
|
8
|
+
from smftools.logging_utils import get_logger
|
|
9
|
+
from smftools.optional_imports import require
|
|
10
|
+
|
|
11
|
+
logger = get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
6
14
|
def spatial_adata(
|
|
7
15
|
config_path: str,
|
|
8
16
|
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
|
|
@@ -26,10 +34,10 @@ def spatial_adata(
|
|
|
26
34
|
spatial_adata_path : Path | None
|
|
27
35
|
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
28
36
|
"""
|
|
29
|
-
from ..readwrite import
|
|
37
|
+
from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
|
|
38
|
+
from .helpers import get_adata_paths
|
|
30
39
|
from .load_adata import load_adata
|
|
31
40
|
from .preprocess_adata import preprocess_adata
|
|
32
|
-
from .helpers import get_adata_paths
|
|
33
41
|
|
|
34
42
|
# 1) Ensure config + basic paths via load_adata
|
|
35
43
|
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
@@ -45,21 +53,22 @@ def spatial_adata(
|
|
|
45
53
|
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
46
54
|
# If HMM exists, it's the most processed stage — reuse it.
|
|
47
55
|
if hmm_path.exists():
|
|
48
|
-
|
|
56
|
+
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
49
57
|
return None, hmm_path
|
|
50
58
|
|
|
51
59
|
# If spatial exists, we consider spatial analyses already done.
|
|
52
60
|
if spatial_path.exists():
|
|
53
|
-
|
|
61
|
+
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
54
62
|
return None, spatial_path
|
|
55
63
|
|
|
56
64
|
# 2) Ensure preprocessing has been run
|
|
57
65
|
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
58
|
-
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
66
|
+
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
67
|
+
config_path
|
|
68
|
+
)
|
|
59
69
|
|
|
60
70
|
# Helper to load from disk, reusing loaded_adata if it matches
|
|
61
71
|
def _load(path: Path):
|
|
62
|
-
from ..readwrite import safe_read_h5ad
|
|
63
72
|
if loaded_adata is not None and loaded_path == path:
|
|
64
73
|
return loaded_adata
|
|
65
74
|
adata, _ = safe_read_h5ad(path)
|
|
@@ -69,15 +78,19 @@ def spatial_adata(
|
|
|
69
78
|
# Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
|
|
70
79
|
if pp_dedup_adata is not None:
|
|
71
80
|
start_adata = pp_dedup_adata
|
|
81
|
+
source_path = pp_dedup_adata_path_ret
|
|
72
82
|
else:
|
|
73
83
|
if pp_dedup_path.exists():
|
|
74
84
|
start_adata = _load(pp_dedup_path)
|
|
85
|
+
source_path = pp_dedup_path
|
|
75
86
|
elif pp_path.exists():
|
|
76
87
|
start_adata = _load(pp_path)
|
|
88
|
+
source_path = pp_path
|
|
77
89
|
elif raw_path.exists():
|
|
78
90
|
start_adata = _load(raw_path)
|
|
91
|
+
source_path = raw_path
|
|
79
92
|
else:
|
|
80
|
-
|
|
93
|
+
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
81
94
|
return None, None
|
|
82
95
|
|
|
83
96
|
# 4) Run the spatial core
|
|
@@ -88,6 +101,8 @@ def spatial_adata(
|
|
|
88
101
|
pp_adata_path=pp_path,
|
|
89
102
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
90
103
|
pp_adata_in_memory=pp_adata,
|
|
104
|
+
source_adata_path=source_path,
|
|
105
|
+
config_path=config_path,
|
|
91
106
|
)
|
|
92
107
|
|
|
93
108
|
# 5) Register spatial path in summary CSV
|
|
@@ -103,6 +118,8 @@ def spatial_adata_core(
|
|
|
103
118
|
pp_adata_path: Path,
|
|
104
119
|
pp_dup_rem_adata_path: Path,
|
|
105
120
|
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
121
|
+
source_adata_path: Optional[Path] = None,
|
|
122
|
+
config_path: Optional[str] = None,
|
|
106
123
|
) -> Tuple[ad.AnnData, Path]:
|
|
107
124
|
"""
|
|
108
125
|
Core spatial analysis pipeline.
|
|
@@ -139,32 +156,33 @@ def spatial_adata_core(
|
|
|
139
156
|
|
|
140
157
|
import numpy as np
|
|
141
158
|
import pandas as pd
|
|
142
|
-
import scanpy as sc
|
|
143
159
|
|
|
144
|
-
|
|
145
|
-
from .helpers import write_gz_h5ad
|
|
160
|
+
sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
|
|
146
161
|
|
|
147
|
-
from ..
|
|
148
|
-
load_sample_sheet,
|
|
149
|
-
invert_adata,
|
|
150
|
-
reindex_references_adata,
|
|
151
|
-
)
|
|
162
|
+
from ..metadata import record_smftools_metadata
|
|
152
163
|
from ..plotting import (
|
|
153
164
|
combined_raw_clustermap,
|
|
154
165
|
plot_rolling_grid,
|
|
155
166
|
plot_spatial_autocorr_grid,
|
|
156
167
|
)
|
|
168
|
+
from ..preprocessing import (
|
|
169
|
+
invert_adata,
|
|
170
|
+
load_sample_sheet,
|
|
171
|
+
reindex_references_adata,
|
|
172
|
+
)
|
|
173
|
+
from ..readwrite import make_dirs, safe_read_h5ad
|
|
157
174
|
from ..tools import calculate_umap
|
|
175
|
+
from ..tools.position_stats import (
|
|
176
|
+
compute_positionwise_statistics,
|
|
177
|
+
plot_positionwise_matrices,
|
|
178
|
+
)
|
|
158
179
|
from ..tools.spatial_autocorrelation import (
|
|
159
|
-
binary_autocorrelation_with_spacing,
|
|
160
180
|
analyze_autocorr_matrix,
|
|
181
|
+
binary_autocorrelation_with_spacing,
|
|
161
182
|
bootstrap_periodicity,
|
|
162
183
|
rolling_autocorr_metrics,
|
|
163
184
|
)
|
|
164
|
-
from
|
|
165
|
-
compute_positionwise_statistics,
|
|
166
|
-
plot_positionwise_matrices,
|
|
167
|
-
)
|
|
185
|
+
from .helpers import write_gz_h5ad
|
|
168
186
|
|
|
169
187
|
# -----------------------------
|
|
170
188
|
# General setup
|
|
@@ -207,7 +225,12 @@ def spatial_adata_core(
|
|
|
207
225
|
offsets=cfg.reindexing_offsets,
|
|
208
226
|
new_col=cfg.reindexed_var_suffix,
|
|
209
227
|
)
|
|
210
|
-
|
|
228
|
+
|
|
229
|
+
if adata.uns.get("reindex_references_adata_performed", False):
|
|
230
|
+
reindex_suffix = cfg.reindexed_var_suffix
|
|
231
|
+
else:
|
|
232
|
+
reindex_suffix = None
|
|
233
|
+
|
|
211
234
|
pp_dir = output_directory / "preprocessed"
|
|
212
235
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
213
236
|
|
|
@@ -223,7 +246,9 @@ def spatial_adata_core(
|
|
|
223
246
|
if pp_clustermap_dir.is_dir() and not getattr(
|
|
224
247
|
cfg, "force_redo_spatial_analyses", False
|
|
225
248
|
):
|
|
226
|
-
|
|
249
|
+
logger.debug(
|
|
250
|
+
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
251
|
+
)
|
|
227
252
|
else:
|
|
228
253
|
make_dirs([pp_dir, pp_clustermap_dir])
|
|
229
254
|
|
|
@@ -232,6 +257,34 @@ def spatial_adata_core(
|
|
|
232
257
|
else:
|
|
233
258
|
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
234
259
|
|
|
260
|
+
# -----------------------------
|
|
261
|
+
# Optional sample sheet metadata
|
|
262
|
+
# -----------------------------
|
|
263
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
264
|
+
load_sample_sheet(
|
|
265
|
+
pp_adata,
|
|
266
|
+
cfg.sample_sheet_path,
|
|
267
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
268
|
+
as_category=True,
|
|
269
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
# -----------------------------
|
|
273
|
+
# Optional inversion along positions axis
|
|
274
|
+
# -----------------------------
|
|
275
|
+
if getattr(cfg, "invert_adata", False):
|
|
276
|
+
pp_adata = invert_adata(pp_adata)
|
|
277
|
+
|
|
278
|
+
# -----------------------------
|
|
279
|
+
# Optional reindexing by reference
|
|
280
|
+
# -----------------------------
|
|
281
|
+
reindex_references_adata(
|
|
282
|
+
pp_adata,
|
|
283
|
+
reference_col=cfg.reference_column,
|
|
284
|
+
offsets=cfg.reindexing_offsets,
|
|
285
|
+
new_col=cfg.reindexed_var_suffix,
|
|
286
|
+
)
|
|
287
|
+
|
|
235
288
|
combined_raw_clustermap(
|
|
236
289
|
pp_adata,
|
|
237
290
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
@@ -247,16 +300,19 @@ def spatial_adata_core(
|
|
|
247
300
|
cmap_a=cfg.clustermap_cmap_a,
|
|
248
301
|
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
249
302
|
min_length=cfg.read_len_filter_thresholds[0],
|
|
250
|
-
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
303
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
304
|
+
0
|
|
305
|
+
],
|
|
251
306
|
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
307
|
+
demux_types=("double", "already"),
|
|
252
308
|
bins=None,
|
|
253
309
|
sample_mapping=None,
|
|
254
310
|
save_path=pp_clustermap_dir,
|
|
255
311
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
256
312
|
deaminase=deaminase,
|
|
257
|
-
index_col_suffix=
|
|
313
|
+
index_col_suffix=reindex_suffix,
|
|
258
314
|
)
|
|
259
|
-
|
|
315
|
+
|
|
260
316
|
# ============================================================
|
|
261
317
|
# 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
|
|
262
318
|
# ============================================================
|
|
@@ -265,10 +321,10 @@ def spatial_adata_core(
|
|
|
265
321
|
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
266
322
|
|
|
267
323
|
# Clustermaps on deduplicated adata
|
|
268
|
-
if pp_clustermap_dir_dedup.is_dir() and not getattr(
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
324
|
+
if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
325
|
+
logger.debug(
|
|
326
|
+
f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
327
|
+
)
|
|
272
328
|
else:
|
|
273
329
|
make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
|
|
274
330
|
combined_raw_clustermap(
|
|
@@ -286,19 +342,22 @@ def spatial_adata_core(
|
|
|
286
342
|
cmap_a=cfg.clustermap_cmap_a,
|
|
287
343
|
min_quality=cfg.read_quality_filter_thresholds[0],
|
|
288
344
|
min_length=cfg.read_len_filter_thresholds[0],
|
|
289
|
-
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
345
|
+
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
346
|
+
0
|
|
347
|
+
],
|
|
290
348
|
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
349
|
+
demux_types=("double", "already"),
|
|
291
350
|
bins=None,
|
|
292
351
|
sample_mapping=None,
|
|
293
352
|
save_path=pp_clustermap_dir_dedup,
|
|
294
353
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
295
354
|
deaminase=deaminase,
|
|
296
|
-
index_col_suffix=
|
|
355
|
+
index_col_suffix=reindex_suffix,
|
|
297
356
|
)
|
|
298
357
|
|
|
299
358
|
# UMAP / Leiden
|
|
300
359
|
if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
301
|
-
|
|
360
|
+
logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
|
|
302
361
|
else:
|
|
303
362
|
make_dirs([pp_umap_dir])
|
|
304
363
|
|
|
@@ -336,40 +395,48 @@ def spatial_adata_core(
|
|
|
336
395
|
pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
|
|
337
396
|
|
|
338
397
|
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
339
|
-
|
|
398
|
+
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
340
399
|
else:
|
|
341
400
|
positions = adata.var_names.astype(int).values
|
|
342
401
|
lags = np.arange(cfg.autocorr_max_lag + 1)
|
|
343
402
|
|
|
344
403
|
try:
|
|
345
404
|
from joblib import Parallel, delayed
|
|
405
|
+
|
|
346
406
|
_have_joblib = True
|
|
347
407
|
except Exception:
|
|
348
408
|
_have_joblib = False
|
|
349
409
|
|
|
350
|
-
samples =
|
|
410
|
+
samples = (
|
|
411
|
+
adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
412
|
+
)
|
|
351
413
|
ref_col = getattr(cfg, "reference_strand_col", "Reference_strand")
|
|
352
414
|
refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
|
|
353
415
|
|
|
354
416
|
for site_type in cfg.autocorr_site_types:
|
|
355
417
|
layer_key = f"{site_type}_site_binary"
|
|
356
418
|
if layer_key not in adata.layers:
|
|
357
|
-
|
|
419
|
+
logger.debug(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
|
|
358
420
|
continue
|
|
359
421
|
|
|
360
422
|
X = adata.layers[layer_key]
|
|
361
423
|
if getattr(X, "shape", (0,))[0] == 0:
|
|
362
|
-
|
|
424
|
+
logger.debug(f"Layer {layer_key} empty — skipping {site_type}.")
|
|
363
425
|
continue
|
|
364
426
|
|
|
365
427
|
rows = []
|
|
366
428
|
counts = []
|
|
367
429
|
|
|
368
430
|
if _have_joblib:
|
|
431
|
+
|
|
369
432
|
def _worker(row):
|
|
370
433
|
try:
|
|
371
434
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
372
|
-
row,
|
|
435
|
+
row,
|
|
436
|
+
positions,
|
|
437
|
+
max_lag=cfg.autocorr_max_lag,
|
|
438
|
+
return_counts=True,
|
|
439
|
+
normalize=cfg.autocorr_normalization_method,
|
|
373
440
|
)
|
|
374
441
|
except Exception:
|
|
375
442
|
ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
|
|
@@ -385,7 +452,11 @@ def spatial_adata_core(
|
|
|
385
452
|
else:
|
|
386
453
|
for i in range(X.shape[0]):
|
|
387
454
|
ac, cnts = binary_autocorrelation_with_spacing(
|
|
388
|
-
X[i],
|
|
455
|
+
X[i],
|
|
456
|
+
positions,
|
|
457
|
+
max_lag=cfg.autocorr_max_lag,
|
|
458
|
+
return_counts=True,
|
|
459
|
+
normalize=cfg.autocorr_normalization_method,
|
|
389
460
|
)
|
|
390
461
|
rows.append(ac)
|
|
391
462
|
counts.append(cnts)
|
|
@@ -474,7 +545,9 @@ def spatial_adata_core(
|
|
|
474
545
|
try:
|
|
475
546
|
r = analyze_autocorr_matrix(
|
|
476
547
|
ac_sel,
|
|
477
|
-
cnt_sel
|
|
548
|
+
cnt_sel
|
|
549
|
+
if cnt_sel is not None
|
|
550
|
+
else np.zeros_like(ac_sel, dtype=int),
|
|
478
551
|
lags,
|
|
479
552
|
nrl_search_bp=(120, 260),
|
|
480
553
|
pad_factor=4,
|
|
@@ -489,7 +562,9 @@ def spatial_adata_core(
|
|
|
489
562
|
|
|
490
563
|
adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
|
|
491
564
|
|
|
492
|
-
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
565
|
+
global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get(
|
|
566
|
+
"nrl_bp", None
|
|
567
|
+
)
|
|
493
568
|
|
|
494
569
|
rolling_cfg = {
|
|
495
570
|
"window_size": getattr(
|
|
@@ -554,27 +629,31 @@ def spatial_adata_core(
|
|
|
554
629
|
fixed_nrl_bp=global_nrl,
|
|
555
630
|
)
|
|
556
631
|
except Exception as e:
|
|
557
|
-
|
|
632
|
+
logger.warning(
|
|
558
633
|
f"rolling_autocorr_metrics failed for {site_type} "
|
|
559
634
|
f"{sample_name} {ref_label}: {e}"
|
|
560
635
|
)
|
|
561
636
|
continue
|
|
562
637
|
|
|
563
638
|
if "center" not in df_roll.columns:
|
|
564
|
-
|
|
639
|
+
logger.warning(
|
|
565
640
|
f"rolling_autocorr_metrics returned unexpected schema "
|
|
566
641
|
f"for {site_type} {sample_name} {ref_label}"
|
|
567
642
|
)
|
|
568
643
|
continue
|
|
569
644
|
|
|
570
|
-
compact_df = df_roll[
|
|
645
|
+
compact_df = df_roll[
|
|
646
|
+
["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]
|
|
647
|
+
].copy()
|
|
571
648
|
compact_df["site"] = site_type
|
|
572
649
|
compact_df["sample"] = sample_name
|
|
573
650
|
compact_df["reference"] = ref_label if ref_label != "all" else "all"
|
|
574
651
|
|
|
575
652
|
if write_csvs:
|
|
576
653
|
safe_sample = str(sample_name).replace(os.sep, "_")
|
|
577
|
-
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
654
|
+
safe_ref = str(ref_label if ref_label != "all" else "all").replace(
|
|
655
|
+
os.sep, "_"
|
|
656
|
+
)
|
|
578
657
|
out_csv = os.path.join(
|
|
579
658
|
site_out_dir,
|
|
580
659
|
f"{safe_sample}__{safe_ref}__rolling_metrics.csv",
|
|
@@ -582,7 +661,7 @@ def spatial_adata_core(
|
|
|
582
661
|
try:
|
|
583
662
|
compact_df.to_csv(out_csv, index=False)
|
|
584
663
|
except Exception as e:
|
|
585
|
-
|
|
664
|
+
logger.warning(f"Failed to write rolling CSV {out_csv}: {e}")
|
|
586
665
|
|
|
587
666
|
if write_plots:
|
|
588
667
|
try:
|
|
@@ -604,7 +683,7 @@ def spatial_adata_core(
|
|
|
604
683
|
show=False,
|
|
605
684
|
)
|
|
606
685
|
except Exception as e:
|
|
607
|
-
|
|
686
|
+
logger.warning(
|
|
608
687
|
f"Failed to create rolling plot for {site_type} "
|
|
609
688
|
f"{sample_name} {ref_label}: {e}"
|
|
610
689
|
)
|
|
@@ -612,7 +691,9 @@ def spatial_adata_core(
|
|
|
612
691
|
combined_rows.append(
|
|
613
692
|
compact_df.assign(site=site_type, sample=sample_name, reference=ref_label)
|
|
614
693
|
)
|
|
615
|
-
rolling_results_by_group[
|
|
694
|
+
rolling_results_by_group[
|
|
695
|
+
(sample_name, None if ref_label == "all" else ref_label)
|
|
696
|
+
] = compact_df
|
|
616
697
|
|
|
617
698
|
adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
|
|
618
699
|
|
|
@@ -624,9 +705,7 @@ def spatial_adata_core(
|
|
|
624
705
|
try:
|
|
625
706
|
combined_df_site.to_csv(combined_out_csv, index=False)
|
|
626
707
|
except Exception as e:
|
|
627
|
-
|
|
628
|
-
f"Failed to write combined rolling CSV for {site_type}: {e}"
|
|
629
|
-
)
|
|
708
|
+
logger.warning(f"Failed to write combined rolling CSV for {site_type}: {e}")
|
|
630
709
|
|
|
631
710
|
rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
|
|
632
711
|
plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
|
|
@@ -650,6 +729,7 @@ def spatial_adata_core(
|
|
|
650
729
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
651
730
|
window=cfg.autocorr_rolling_window_size,
|
|
652
731
|
rows_per_fig=cfg.rows_per_qc_autocorr_grid,
|
|
732
|
+
normalization_method=cfg.autocorr_normalization_method,
|
|
653
733
|
)
|
|
654
734
|
|
|
655
735
|
# ============================================================
|
|
@@ -658,7 +738,7 @@ def spatial_adata_core(
|
|
|
658
738
|
pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
|
|
659
739
|
|
|
660
740
|
if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
661
|
-
|
|
741
|
+
logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
662
742
|
else:
|
|
663
743
|
compute_positionwise_statistics(
|
|
664
744
|
adata,
|
|
@@ -691,7 +771,15 @@ def spatial_adata_core(
|
|
|
691
771
|
# 5) Save spatial AnnData
|
|
692
772
|
# ============================================================
|
|
693
773
|
if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
|
|
694
|
-
|
|
774
|
+
logger.info("Saving spatial analyzed AnnData (post preprocessing and duplicate removal).")
|
|
775
|
+
record_smftools_metadata(
|
|
776
|
+
adata,
|
|
777
|
+
step_name="spatial",
|
|
778
|
+
cfg=cfg,
|
|
779
|
+
config_path=config_path,
|
|
780
|
+
input_paths=[source_adata_path] if source_adata_path else None,
|
|
781
|
+
output_path=spatial_adata_path,
|
|
782
|
+
)
|
|
695
783
|
write_gz_h5ad(adata, spatial_adata_path)
|
|
696
784
|
|
|
697
|
-
return adata, spatial_adata_path
|
|
785
|
+
return adata, spatial_adata_path
|