smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Optional, Tuple
|
|
5
6
|
|
|
6
7
|
import anndata as ad
|
|
7
8
|
|
|
8
|
-
from smftools.
|
|
9
|
-
from smftools.
|
|
9
|
+
from smftools.constants import LOGGING_DIR, SPATIAL_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
10
11
|
|
|
11
12
|
logger = get_logger(__name__)
|
|
12
13
|
|
|
@@ -34,64 +35,61 @@ def spatial_adata(
|
|
|
34
35
|
spatial_adata_path : Path | None
|
|
35
36
|
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
36
37
|
"""
|
|
37
|
-
from ..readwrite import
|
|
38
|
-
from .helpers import get_adata_paths
|
|
39
|
-
from .load_adata import load_adata
|
|
40
|
-
from .preprocess_adata import preprocess_adata
|
|
38
|
+
from ..readwrite import safe_read_h5ad
|
|
39
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
41
40
|
|
|
42
41
|
# 1) Ensure config + basic paths via load_adata
|
|
43
|
-
|
|
42
|
+
cfg = load_experiment_config(config_path)
|
|
43
|
+
|
|
44
44
|
paths = get_adata_paths(cfg)
|
|
45
45
|
|
|
46
|
-
raw_path = paths.raw
|
|
47
46
|
pp_path = paths.pp
|
|
48
47
|
pp_dedup_path = paths.pp_dedup
|
|
49
48
|
spatial_path = paths.spatial
|
|
49
|
+
chimeric_path = paths.chimeric
|
|
50
|
+
variant_path = paths.variant
|
|
50
51
|
hmm_path = paths.hmm
|
|
52
|
+
latent_path = paths.latent
|
|
51
53
|
|
|
52
54
|
# Stage-skipping logic for spatial
|
|
53
55
|
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
54
|
-
# If HMM exists, it's the most processed stage — reuse it.
|
|
55
|
-
if hmm_path.exists():
|
|
56
|
-
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
57
|
-
return None, hmm_path
|
|
58
|
-
|
|
59
56
|
# If spatial exists, we consider spatial analyses already done.
|
|
60
57
|
if spatial_path.exists():
|
|
61
58
|
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
62
59
|
return None, spatial_path
|
|
63
60
|
|
|
64
|
-
# 2) Ensure preprocessing has been run
|
|
65
|
-
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
66
|
-
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
67
|
-
config_path
|
|
68
|
-
)
|
|
69
|
-
|
|
70
61
|
# Helper to load from disk, reusing loaded_adata if it matches
|
|
71
62
|
def _load(path: Path):
|
|
72
|
-
if loaded_adata is not None and loaded_path == path:
|
|
73
|
-
return loaded_adata
|
|
74
63
|
adata, _ = safe_read_h5ad(path)
|
|
75
64
|
return adata
|
|
76
65
|
|
|
77
66
|
# 3) Decide which AnnData to use as the *starting point* for spatial analyses
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
67
|
+
if hmm_path.exists():
|
|
68
|
+
start_adata = _load(hmm_path)
|
|
69
|
+
source_path = hmm_path
|
|
70
|
+
elif latent_path.exists():
|
|
71
|
+
start_adata = _load(latent_path)
|
|
72
|
+
source_path = latent_path
|
|
73
|
+
elif spatial_path.exists():
|
|
74
|
+
start_adata = _load(spatial_path)
|
|
75
|
+
source_path = spatial_path
|
|
76
|
+
elif chimeric_path.exists():
|
|
77
|
+
start_adata = _load(chimeric_path)
|
|
78
|
+
source_path = chimeric_path
|
|
79
|
+
elif variant_path.exists():
|
|
80
|
+
start_adata = _load(variant_path)
|
|
81
|
+
source_path = variant_path
|
|
82
|
+
elif pp_dedup_path.exists():
|
|
83
|
+
start_adata = _load(pp_dedup_path)
|
|
84
|
+
source_path = pp_dedup_path
|
|
85
|
+
elif pp_path.exists():
|
|
86
|
+
start_adata = _load(pp_path)
|
|
87
|
+
source_path = pp_path
|
|
82
88
|
else:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
start_adata = _load(pp_path)
|
|
88
|
-
source_path = pp_path
|
|
89
|
-
elif raw_path.exists():
|
|
90
|
-
start_adata = _load(raw_path)
|
|
91
|
-
source_path = raw_path
|
|
92
|
-
else:
|
|
93
|
-
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
94
|
-
return None, None
|
|
89
|
+
logger.warning(
|
|
90
|
+
"No suitable AnnData found for spatial analyses (need at least preprocessed)."
|
|
91
|
+
)
|
|
92
|
+
return None, None
|
|
95
93
|
|
|
96
94
|
# 4) Run the spatial core
|
|
97
95
|
adata_spatial, spatial_path = spatial_adata_core(
|
|
@@ -99,15 +97,10 @@ def spatial_adata(
|
|
|
99
97
|
cfg=cfg,
|
|
100
98
|
spatial_adata_path=spatial_path,
|
|
101
99
|
pp_adata_path=pp_path,
|
|
102
|
-
pp_dup_rem_adata_path=pp_dedup_path,
|
|
103
|
-
pp_adata_in_memory=pp_adata,
|
|
104
100
|
source_adata_path=source_path,
|
|
105
101
|
config_path=config_path,
|
|
106
102
|
)
|
|
107
103
|
|
|
108
|
-
# 5) Register spatial path in summary CSV
|
|
109
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
|
|
110
|
-
|
|
111
104
|
return adata_spatial, spatial_path
|
|
112
105
|
|
|
113
106
|
|
|
@@ -116,8 +109,6 @@ def spatial_adata_core(
|
|
|
116
109
|
cfg,
|
|
117
110
|
spatial_adata_path: Path,
|
|
118
111
|
pp_adata_path: Path,
|
|
119
|
-
pp_dup_rem_adata_path: Path,
|
|
120
|
-
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
121
112
|
source_adata_path: Optional[Path] = None,
|
|
122
113
|
config_path: Optional[str] = None,
|
|
123
114
|
) -> Tuple[ad.AnnData, Path]:
|
|
@@ -129,8 +120,6 @@ def spatial_adata_core(
|
|
|
129
120
|
- `cfg` is the ExperimentConfig.
|
|
130
121
|
- `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
|
|
131
122
|
from `get_adata_paths`.
|
|
132
|
-
- `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
|
|
133
|
-
the same run of `preprocess_adata`, to avoid re-reading from disk.
|
|
134
123
|
|
|
135
124
|
Does:
|
|
136
125
|
- Optional sample sheet load.
|
|
@@ -152,13 +141,12 @@ def spatial_adata_core(
|
|
|
152
141
|
"""
|
|
153
142
|
import os
|
|
154
143
|
import warnings
|
|
144
|
+
from datetime import datetime
|
|
155
145
|
from pathlib import Path
|
|
156
146
|
|
|
157
147
|
import numpy as np
|
|
158
148
|
import pandas as pd
|
|
159
149
|
|
|
160
|
-
sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
|
|
161
|
-
|
|
162
150
|
from ..metadata import record_smftools_metadata
|
|
163
151
|
from ..plotting import (
|
|
164
152
|
combined_raw_clustermap,
|
|
@@ -171,7 +159,6 @@ def spatial_adata_core(
|
|
|
171
159
|
reindex_references_adata,
|
|
172
160
|
)
|
|
173
161
|
from ..readwrite import make_dirs, safe_read_h5ad
|
|
174
|
-
from ..tools import calculate_umap
|
|
175
162
|
from ..tools.position_stats import (
|
|
176
163
|
compute_positionwise_statistics,
|
|
177
164
|
plot_positionwise_matrices,
|
|
@@ -187,8 +174,24 @@ def spatial_adata_core(
|
|
|
187
174
|
# -----------------------------
|
|
188
175
|
# General setup
|
|
189
176
|
# -----------------------------
|
|
177
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
178
|
+
now = datetime.now()
|
|
179
|
+
time_str = now.strftime("%H%M%S")
|
|
180
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
181
|
+
|
|
190
182
|
output_directory = Path(cfg.output_directory)
|
|
191
|
-
|
|
183
|
+
spatial_directory = output_directory / SPATIAL_DIR
|
|
184
|
+
logging_directory = spatial_directory / LOGGING_DIR
|
|
185
|
+
|
|
186
|
+
make_dirs([output_directory, spatial_directory])
|
|
187
|
+
|
|
188
|
+
if cfg.emit_log_file:
|
|
189
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
190
|
+
make_dirs([logging_directory])
|
|
191
|
+
else:
|
|
192
|
+
log_file = None
|
|
193
|
+
|
|
194
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
192
195
|
|
|
193
196
|
smf_modality = cfg.smf_modality
|
|
194
197
|
if smf_modality == "conversion":
|
|
@@ -196,8 +199,6 @@ def spatial_adata_core(
|
|
|
196
199
|
else:
|
|
197
200
|
deaminase = True
|
|
198
201
|
|
|
199
|
-
first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
|
|
200
|
-
|
|
201
202
|
# -----------------------------
|
|
202
203
|
# Optional sample sheet metadata
|
|
203
204
|
# -----------------------------
|
|
@@ -231,17 +232,16 @@ def spatial_adata_core(
|
|
|
231
232
|
else:
|
|
232
233
|
reindex_suffix = None
|
|
233
234
|
|
|
234
|
-
pp_dir = output_directory / "preprocessed"
|
|
235
235
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
236
236
|
|
|
237
237
|
# ============================================================
|
|
238
|
-
# 1) Clustermaps (non-direct modalities) on
|
|
238
|
+
# 1) Clustermaps (non-direct modalities) on preprocessed adata
|
|
239
239
|
# ============================================================
|
|
240
240
|
if smf_modality != "direct":
|
|
241
241
|
preprocessed_version_available = pp_adata_path.exists()
|
|
242
242
|
|
|
243
243
|
if preprocessed_version_available:
|
|
244
|
-
pp_clustermap_dir =
|
|
244
|
+
pp_clustermap_dir = spatial_directory / "01_clustermaps"
|
|
245
245
|
|
|
246
246
|
if pp_clustermap_dir.is_dir() and not getattr(
|
|
247
247
|
cfg, "force_redo_spatial_analyses", False
|
|
@@ -250,12 +250,9 @@ def spatial_adata_core(
|
|
|
250
250
|
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
251
251
|
)
|
|
252
252
|
else:
|
|
253
|
-
make_dirs([
|
|
253
|
+
make_dirs([spatial_directory, pp_clustermap_dir])
|
|
254
254
|
|
|
255
|
-
|
|
256
|
-
pp_adata = pp_adata_in_memory
|
|
257
|
-
else:
|
|
258
|
-
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
255
|
+
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
259
256
|
|
|
260
257
|
# -----------------------------
|
|
261
258
|
# Optional sample sheet metadata
|
|
@@ -303,8 +300,8 @@ def spatial_adata_core(
|
|
|
303
300
|
min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
|
|
304
301
|
0
|
|
305
302
|
],
|
|
306
|
-
min_position_valid_fraction=cfg.
|
|
307
|
-
demux_types=
|
|
303
|
+
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
304
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
308
305
|
bins=None,
|
|
309
306
|
sample_mapping=None,
|
|
310
307
|
save_path=pp_clustermap_dir,
|
|
@@ -314,19 +311,18 @@ def spatial_adata_core(
|
|
|
314
311
|
)
|
|
315
312
|
|
|
316
313
|
# ============================================================
|
|
317
|
-
# 2) Clustermaps
|
|
314
|
+
# 2) Clustermaps on deduplicated preprocessed AnnDatas
|
|
318
315
|
# ============================================================
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
316
|
+
spatial_dir_dedup = spatial_directory / "deduplicated"
|
|
317
|
+
clustermap_dir_dedup = spatial_dir_dedup / "01_clustermaps"
|
|
322
318
|
|
|
323
319
|
# Clustermaps on deduplicated adata
|
|
324
|
-
if
|
|
320
|
+
if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
325
321
|
logger.debug(
|
|
326
|
-
f"{
|
|
322
|
+
f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
327
323
|
)
|
|
328
324
|
else:
|
|
329
|
-
make_dirs([
|
|
325
|
+
make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
|
|
330
326
|
combined_raw_clustermap(
|
|
331
327
|
adata,
|
|
332
328
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
@@ -346,53 +342,19 @@ def spatial_adata_core(
|
|
|
346
342
|
0
|
|
347
343
|
],
|
|
348
344
|
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
349
|
-
demux_types=
|
|
345
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
350
346
|
bins=None,
|
|
351
347
|
sample_mapping=None,
|
|
352
|
-
save_path=
|
|
348
|
+
save_path=clustermap_dir_dedup,
|
|
353
349
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
354
350
|
deaminase=deaminase,
|
|
355
351
|
index_col_suffix=reindex_suffix,
|
|
356
352
|
)
|
|
357
353
|
|
|
358
|
-
# UMAP / Leiden
|
|
359
|
-
if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
360
|
-
logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
|
|
361
|
-
else:
|
|
362
|
-
make_dirs([pp_umap_dir])
|
|
363
|
-
|
|
364
|
-
var_filters = []
|
|
365
|
-
if smf_modality == "direct":
|
|
366
|
-
for ref in references:
|
|
367
|
-
for base in cfg.mod_target_bases:
|
|
368
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
369
|
-
elif deaminase:
|
|
370
|
-
for ref in references:
|
|
371
|
-
var_filters.append(f"{ref}_C_site")
|
|
372
|
-
else:
|
|
373
|
-
for ref in references:
|
|
374
|
-
for base in cfg.mod_target_bases:
|
|
375
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
376
|
-
|
|
377
|
-
adata = calculate_umap(
|
|
378
|
-
adata,
|
|
379
|
-
layer=cfg.layer_for_umap_plotting,
|
|
380
|
-
var_filters=var_filters,
|
|
381
|
-
n_pcs=10,
|
|
382
|
-
knn_neighbors=15,
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
|
|
386
|
-
|
|
387
|
-
sc.settings.figdir = pp_umap_dir
|
|
388
|
-
umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
|
|
389
|
-
umap_layers += cfg.umap_layers_to_plot
|
|
390
|
-
sc.pl.umap(adata, color=umap_layers, show=False, save=True)
|
|
391
|
-
|
|
392
354
|
# ============================================================
|
|
393
355
|
# 3) Spatial autocorrelation + rolling metrics
|
|
394
356
|
# ============================================================
|
|
395
|
-
pp_autocorr_dir =
|
|
357
|
+
pp_autocorr_dir = spatial_dir_dedup / "02_autocorrelations"
|
|
396
358
|
|
|
397
359
|
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
398
360
|
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
@@ -735,10 +697,10 @@ def spatial_adata_core(
|
|
|
735
697
|
# ============================================================
|
|
736
698
|
# 4) Pearson / correlation matrices
|
|
737
699
|
# ============================================================
|
|
738
|
-
|
|
700
|
+
corr_dir = spatial_dir_dedup / "03_correlation_matrices"
|
|
739
701
|
|
|
740
|
-
if
|
|
741
|
-
logger.debug(f"{
|
|
702
|
+
if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
703
|
+
logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
742
704
|
else:
|
|
743
705
|
compute_positionwise_statistics(
|
|
744
706
|
adata,
|
|
@@ -763,15 +725,15 @@ def spatial_adata_core(
|
|
|
763
725
|
cmaps=cfg.correlation_matrix_cmaps,
|
|
764
726
|
vmin=None,
|
|
765
727
|
vmax=None,
|
|
766
|
-
output_dir=
|
|
728
|
+
output_dir=corr_dir,
|
|
767
729
|
output_key="positionwise_result",
|
|
768
730
|
)
|
|
769
731
|
|
|
770
732
|
# ============================================================
|
|
771
|
-
#
|
|
733
|
+
# 4) Save spatial AnnData
|
|
772
734
|
# ============================================================
|
|
773
735
|
if (not spatial_adata_path.exists()) or getattr(cfg, "force_redo_spatial_analyses", False):
|
|
774
|
-
logger.info("Saving spatial analyzed AnnData
|
|
736
|
+
logger.info("Saving spatial analyzed AnnData.")
|
|
775
737
|
record_smftools_metadata(
|
|
776
738
|
adata,
|
|
777
739
|
step_name="spatial",
|