smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Optional, Tuple
|
|
5
6
|
|
|
6
7
|
import anndata as ad
|
|
7
8
|
|
|
8
|
-
from smftools.
|
|
9
|
+
from smftools.constants import LOGGING_DIR, SEQUENCE_INTEGER_ENCODING, SPATIAL_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
9
11
|
from smftools.optional_imports import require
|
|
10
12
|
|
|
11
13
|
logger = get_logger(__name__)
|
|
@@ -35,15 +37,13 @@ def spatial_adata(
|
|
|
35
37
|
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
36
38
|
"""
|
|
37
39
|
from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
|
|
38
|
-
from .helpers import get_adata_paths
|
|
39
|
-
from .load_adata import load_adata
|
|
40
|
-
from .preprocess_adata import preprocess_adata
|
|
40
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
41
41
|
|
|
42
42
|
# 1) Ensure config + basic paths via load_adata
|
|
43
|
-
|
|
43
|
+
cfg = load_experiment_config(config_path)
|
|
44
|
+
|
|
44
45
|
paths = get_adata_paths(cfg)
|
|
45
46
|
|
|
46
|
-
raw_path = paths.raw
|
|
47
47
|
pp_path = paths.pp
|
|
48
48
|
pp_dedup_path = paths.pp_dedup
|
|
49
49
|
spatial_path = paths.spatial
|
|
@@ -51,47 +51,34 @@ def spatial_adata(
|
|
|
51
51
|
|
|
52
52
|
# Stage-skipping logic for spatial
|
|
53
53
|
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
54
|
-
# If HMM exists, it's the most processed stage — reuse it.
|
|
55
|
-
if hmm_path.exists():
|
|
56
|
-
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
57
|
-
return None, hmm_path
|
|
58
|
-
|
|
59
54
|
# If spatial exists, we consider spatial analyses already done.
|
|
60
55
|
if spatial_path.exists():
|
|
61
56
|
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
62
57
|
return None, spatial_path
|
|
63
58
|
|
|
64
|
-
# 2) Ensure preprocessing has been run
|
|
65
|
-
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
66
|
-
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
67
|
-
config_path
|
|
68
|
-
)
|
|
69
|
-
|
|
70
59
|
# Helper to load from disk, reusing loaded_adata if it matches
|
|
71
60
|
def _load(path: Path):
|
|
72
|
-
if loaded_adata is not None and loaded_path == path:
|
|
73
|
-
return loaded_adata
|
|
74
61
|
adata, _ = safe_read_h5ad(path)
|
|
75
62
|
return adata
|
|
76
63
|
|
|
77
64
|
# 3) Decide which AnnData to use as the *starting point* for spatial analyses
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
65
|
+
if hmm_path.exists():
|
|
66
|
+
start_adata = _load(hmm_path)
|
|
67
|
+
source_path = hmm_path
|
|
68
|
+
elif spatial_path.exists():
|
|
69
|
+
start_adata = _load(spatial_path)
|
|
70
|
+
source_path = spatial_path
|
|
71
|
+
elif pp_dedup_path.exists():
|
|
72
|
+
start_adata = _load(pp_dedup_path)
|
|
73
|
+
source_path = pp_dedup_path
|
|
74
|
+
elif pp_path.exists():
|
|
75
|
+
start_adata = _load(pp_path)
|
|
76
|
+
source_path = pp_path
|
|
82
77
|
else:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
start_adata = _load(pp_path)
|
|
88
|
-
source_path = pp_path
|
|
89
|
-
elif raw_path.exists():
|
|
90
|
-
start_adata = _load(raw_path)
|
|
91
|
-
source_path = raw_path
|
|
92
|
-
else:
|
|
93
|
-
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
94
|
-
return None, None
|
|
78
|
+
logger.warning(
|
|
79
|
+
"No suitable AnnData found for spatial analyses (need at least preprocessed)."
|
|
80
|
+
)
|
|
81
|
+
return None, None
|
|
95
82
|
|
|
96
83
|
# 4) Run the spatial core
|
|
97
84
|
adata_spatial, spatial_path = spatial_adata_core(
|
|
@@ -99,15 +86,10 @@ def spatial_adata(
|
|
|
99
86
|
cfg=cfg,
|
|
100
87
|
spatial_adata_path=spatial_path,
|
|
101
88
|
pp_adata_path=pp_path,
|
|
102
|
-
pp_dup_rem_adata_path=pp_dedup_path,
|
|
103
|
-
pp_adata_in_memory=pp_adata,
|
|
104
89
|
source_adata_path=source_path,
|
|
105
90
|
config_path=config_path,
|
|
106
91
|
)
|
|
107
92
|
|
|
108
|
-
# 5) Register spatial path in summary CSV
|
|
109
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
|
|
110
|
-
|
|
111
93
|
return adata_spatial, spatial_path
|
|
112
94
|
|
|
113
95
|
|
|
@@ -116,8 +98,6 @@ def spatial_adata_core(
|
|
|
116
98
|
cfg,
|
|
117
99
|
spatial_adata_path: Path,
|
|
118
100
|
pp_adata_path: Path,
|
|
119
|
-
pp_dup_rem_adata_path: Path,
|
|
120
|
-
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
121
101
|
source_adata_path: Optional[Path] = None,
|
|
122
102
|
config_path: Optional[str] = None,
|
|
123
103
|
) -> Tuple[ad.AnnData, Path]:
|
|
@@ -129,8 +109,6 @@ def spatial_adata_core(
|
|
|
129
109
|
- `cfg` is the ExperimentConfig.
|
|
130
110
|
- `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
|
|
131
111
|
from `get_adata_paths`.
|
|
132
|
-
- `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
|
|
133
|
-
the same run of `preprocess_adata`, to avoid re-reading from disk.
|
|
134
112
|
|
|
135
113
|
Does:
|
|
136
114
|
- Optional sample sheet load.
|
|
@@ -152,17 +130,17 @@ def spatial_adata_core(
|
|
|
152
130
|
"""
|
|
153
131
|
import os
|
|
154
132
|
import warnings
|
|
133
|
+
from datetime import datetime
|
|
155
134
|
from pathlib import Path
|
|
156
135
|
|
|
157
136
|
import numpy as np
|
|
158
137
|
import pandas as pd
|
|
159
138
|
|
|
160
|
-
sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
|
|
161
|
-
|
|
162
139
|
from ..metadata import record_smftools_metadata
|
|
163
140
|
from ..plotting import (
|
|
164
141
|
combined_raw_clustermap,
|
|
165
142
|
plot_rolling_grid,
|
|
143
|
+
plot_rolling_nn_and_layer,
|
|
166
144
|
plot_spatial_autocorr_grid,
|
|
167
145
|
)
|
|
168
146
|
from ..preprocessing import (
|
|
@@ -171,11 +149,12 @@ def spatial_adata_core(
|
|
|
171
149
|
reindex_references_adata,
|
|
172
150
|
)
|
|
173
151
|
from ..readwrite import make_dirs, safe_read_h5ad
|
|
174
|
-
from ..tools import
|
|
152
|
+
from ..tools import rolling_window_nn_distance
|
|
175
153
|
from ..tools.position_stats import (
|
|
176
154
|
compute_positionwise_statistics,
|
|
177
155
|
plot_positionwise_matrices,
|
|
178
156
|
)
|
|
157
|
+
from ..tools.rolling_nn_distance import assign_rolling_nn_results
|
|
179
158
|
from ..tools.spatial_autocorrelation import (
|
|
180
159
|
analyze_autocorr_matrix,
|
|
181
160
|
binary_autocorrelation_with_spacing,
|
|
@@ -187,8 +166,24 @@ def spatial_adata_core(
|
|
|
187
166
|
# -----------------------------
|
|
188
167
|
# General setup
|
|
189
168
|
# -----------------------------
|
|
169
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
170
|
+
now = datetime.now()
|
|
171
|
+
time_str = now.strftime("%H%M%S")
|
|
172
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
173
|
+
|
|
190
174
|
output_directory = Path(cfg.output_directory)
|
|
191
|
-
|
|
175
|
+
spatial_directory = output_directory / SPATIAL_DIR
|
|
176
|
+
logging_directory = spatial_directory / LOGGING_DIR
|
|
177
|
+
|
|
178
|
+
make_dirs([output_directory, spatial_directory])
|
|
179
|
+
|
|
180
|
+
if cfg.emit_log_file:
|
|
181
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
182
|
+
make_dirs([logging_directory])
|
|
183
|
+
else:
|
|
184
|
+
log_file = None
|
|
185
|
+
|
|
186
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
192
187
|
|
|
193
188
|
smf_modality = cfg.smf_modality
|
|
194
189
|
if smf_modality == "conversion":
|
|
@@ -196,8 +191,6 @@ def spatial_adata_core(
|
|
|
196
191
|
else:
|
|
197
192
|
deaminase = True
|
|
198
193
|
|
|
199
|
-
first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
|
|
200
|
-
|
|
201
194
|
# -----------------------------
|
|
202
195
|
# Optional sample sheet metadata
|
|
203
196
|
# -----------------------------
|
|
@@ -231,7 +224,6 @@ def spatial_adata_core(
|
|
|
231
224
|
else:
|
|
232
225
|
reindex_suffix = None
|
|
233
226
|
|
|
234
|
-
pp_dir = output_directory / "preprocessed"
|
|
235
227
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
236
228
|
|
|
237
229
|
# ============================================================
|
|
@@ -241,7 +233,7 @@ def spatial_adata_core(
|
|
|
241
233
|
preprocessed_version_available = pp_adata_path.exists()
|
|
242
234
|
|
|
243
235
|
if preprocessed_version_available:
|
|
244
|
-
pp_clustermap_dir =
|
|
236
|
+
pp_clustermap_dir = spatial_directory / "06_clustermaps"
|
|
245
237
|
|
|
246
238
|
if pp_clustermap_dir.is_dir() and not getattr(
|
|
247
239
|
cfg, "force_redo_spatial_analyses", False
|
|
@@ -250,12 +242,9 @@ def spatial_adata_core(
|
|
|
250
242
|
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
251
243
|
)
|
|
252
244
|
else:
|
|
253
|
-
make_dirs([
|
|
245
|
+
make_dirs([spatial_directory, pp_clustermap_dir])
|
|
254
246
|
|
|
255
|
-
|
|
256
|
-
pp_adata = pp_adata_in_memory
|
|
257
|
-
else:
|
|
258
|
-
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
247
|
+
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
259
248
|
|
|
260
249
|
# -----------------------------
|
|
261
250
|
# Optional sample sheet metadata
|
|
@@ -304,7 +293,7 @@ def spatial_adata_core(
|
|
|
304
293
|
0
|
|
305
294
|
],
|
|
306
295
|
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
307
|
-
demux_types=
|
|
296
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
308
297
|
bins=None,
|
|
309
298
|
sample_mapping=None,
|
|
310
299
|
save_path=pp_clustermap_dir,
|
|
@@ -314,19 +303,18 @@ def spatial_adata_core(
|
|
|
314
303
|
)
|
|
315
304
|
|
|
316
305
|
# ============================================================
|
|
317
|
-
# 2) Clustermaps
|
|
306
|
+
# 2) Clustermaps on *deduplicated* preprocessed AnnData
|
|
318
307
|
# ============================================================
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
308
|
+
spatial_dir_dedup = spatial_directory / "deduplicated"
|
|
309
|
+
clustermap_dir_dedup = spatial_dir_dedup / "06_clustermaps"
|
|
322
310
|
|
|
323
311
|
# Clustermaps on deduplicated adata
|
|
324
|
-
if
|
|
312
|
+
if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
325
313
|
logger.debug(
|
|
326
|
-
f"{
|
|
314
|
+
f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
327
315
|
)
|
|
328
316
|
else:
|
|
329
|
-
make_dirs([
|
|
317
|
+
make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
|
|
330
318
|
combined_raw_clustermap(
|
|
331
319
|
adata,
|
|
332
320
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
@@ -346,53 +334,113 @@ def spatial_adata_core(
|
|
|
346
334
|
0
|
|
347
335
|
],
|
|
348
336
|
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
349
|
-
demux_types=
|
|
337
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
350
338
|
bins=None,
|
|
351
339
|
sample_mapping=None,
|
|
352
|
-
save_path=
|
|
340
|
+
save_path=clustermap_dir_dedup,
|
|
353
341
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
354
342
|
deaminase=deaminase,
|
|
355
343
|
index_col_suffix=reindex_suffix,
|
|
356
344
|
)
|
|
357
345
|
|
|
358
|
-
#
|
|
359
|
-
|
|
360
|
-
|
|
346
|
+
# ============================================================
|
|
347
|
+
# 2b) Rolling NN distances + layer clustermaps
|
|
348
|
+
# ============================================================
|
|
349
|
+
pp_rolling_nn_dir = spatial_dir_dedup / "06b_rolling_nn_clustermaps"
|
|
350
|
+
|
|
351
|
+
if pp_rolling_nn_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
352
|
+
logger.debug(f"{pp_rolling_nn_dir} already exists. Skipping rolling NN distance plots.")
|
|
361
353
|
else:
|
|
362
|
-
make_dirs([
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
if smf_modality == "direct":
|
|
366
|
-
for ref in references:
|
|
367
|
-
for base in cfg.mod_target_bases:
|
|
368
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
369
|
-
elif deaminase:
|
|
370
|
-
for ref in references:
|
|
371
|
-
var_filters.append(f"{ref}_C_site")
|
|
372
|
-
else:
|
|
373
|
-
for ref in references:
|
|
374
|
-
for base in cfg.mod_target_bases:
|
|
375
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
376
|
-
|
|
377
|
-
adata = calculate_umap(
|
|
378
|
-
adata,
|
|
379
|
-
layer=cfg.layer_for_umap_plotting,
|
|
380
|
-
var_filters=var_filters,
|
|
381
|
-
n_pcs=10,
|
|
382
|
-
knn_neighbors=15,
|
|
354
|
+
make_dirs([pp_rolling_nn_dir])
|
|
355
|
+
samples = (
|
|
356
|
+
adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
383
357
|
)
|
|
358
|
+
references = adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
|
|
384
359
|
|
|
385
|
-
|
|
360
|
+
for reference in references:
|
|
361
|
+
for sample in samples:
|
|
362
|
+
mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (
|
|
363
|
+
adata.obs[cfg.reference_column] == reference
|
|
364
|
+
)
|
|
365
|
+
if not mask.any():
|
|
366
|
+
continue
|
|
367
|
+
|
|
368
|
+
subset = adata[mask]
|
|
369
|
+
site_mask = (
|
|
370
|
+
adata.var[[f"{reference}_{st}_site" for st in cfg.rolling_nn_site_types]]
|
|
371
|
+
.fillna(False)
|
|
372
|
+
.any(axis=1)
|
|
373
|
+
)
|
|
374
|
+
subset = subset[:, site_mask].copy()
|
|
375
|
+
try:
|
|
376
|
+
rolling_values, rolling_starts = rolling_window_nn_distance(
|
|
377
|
+
subset,
|
|
378
|
+
layer=cfg.rolling_nn_layer,
|
|
379
|
+
window=cfg.rolling_nn_window,
|
|
380
|
+
step=cfg.rolling_nn_step,
|
|
381
|
+
min_overlap=cfg.rolling_nn_min_overlap,
|
|
382
|
+
return_fraction=cfg.rolling_nn_return_fraction,
|
|
383
|
+
store_obsm=cfg.rolling_nn_obsm_key,
|
|
384
|
+
)
|
|
385
|
+
except Exception as exc:
|
|
386
|
+
logger.warning(
|
|
387
|
+
"Rolling NN distance computation failed for sample=%s ref=%s: %s",
|
|
388
|
+
sample,
|
|
389
|
+
reference,
|
|
390
|
+
exc,
|
|
391
|
+
)
|
|
392
|
+
continue
|
|
386
393
|
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
394
|
+
safe_sample = str(sample).replace(os.sep, "_")
|
|
395
|
+
safe_ref = str(reference).replace(os.sep, "_")
|
|
396
|
+
parent_obsm_key = f"{cfg.rolling_nn_obsm_key}__{safe_ref}"
|
|
397
|
+
try:
|
|
398
|
+
assign_rolling_nn_results(
|
|
399
|
+
adata,
|
|
400
|
+
subset,
|
|
401
|
+
rolling_values,
|
|
402
|
+
rolling_starts,
|
|
403
|
+
obsm_key=parent_obsm_key,
|
|
404
|
+
window=cfg.rolling_nn_window,
|
|
405
|
+
step=cfg.rolling_nn_step,
|
|
406
|
+
min_overlap=cfg.rolling_nn_min_overlap,
|
|
407
|
+
return_fraction=cfg.rolling_nn_return_fraction,
|
|
408
|
+
layer=cfg.rolling_nn_layer,
|
|
409
|
+
)
|
|
410
|
+
except Exception as exc:
|
|
411
|
+
logger.warning(
|
|
412
|
+
"Failed to merge rolling NN results for sample=%s ref=%s: %s",
|
|
413
|
+
sample,
|
|
414
|
+
reference,
|
|
415
|
+
exc,
|
|
416
|
+
)
|
|
417
|
+
adata.uns.setdefault(f"{cfg.rolling_nn_obsm_key}_reference_map", {})[reference] = (
|
|
418
|
+
parent_obsm_key
|
|
419
|
+
)
|
|
420
|
+
out_png = pp_rolling_nn_dir / f"{safe_sample}__{safe_ref}.png"
|
|
421
|
+
title = f"{sample} {reference}"
|
|
422
|
+
try:
|
|
423
|
+
plot_rolling_nn_and_layer(
|
|
424
|
+
subset,
|
|
425
|
+
obsm_key=cfg.rolling_nn_obsm_key,
|
|
426
|
+
layer_key=cfg.rolling_nn_plot_layer,
|
|
427
|
+
max_nan_fraction=cfg.position_max_nan_threshold,
|
|
428
|
+
var_valid_fraction_col=f"{reference}_valid_fraction",
|
|
429
|
+
title=title,
|
|
430
|
+
save_name=out_png,
|
|
431
|
+
)
|
|
432
|
+
except Exception as exc:
|
|
433
|
+
logger.warning(
|
|
434
|
+
"Failed rolling NN plot for sample=%s ref=%s: %s",
|
|
435
|
+
sample,
|
|
436
|
+
reference,
|
|
437
|
+
exc,
|
|
438
|
+
)
|
|
391
439
|
|
|
392
440
|
# ============================================================
|
|
393
441
|
# 3) Spatial autocorrelation + rolling metrics
|
|
394
442
|
# ============================================================
|
|
395
|
-
pp_autocorr_dir =
|
|
443
|
+
pp_autocorr_dir = spatial_dir_dedup / "08_autocorrelations"
|
|
396
444
|
|
|
397
445
|
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
398
446
|
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
@@ -735,10 +783,10 @@ def spatial_adata_core(
|
|
|
735
783
|
# ============================================================
|
|
736
784
|
# 4) Pearson / correlation matrices
|
|
737
785
|
# ============================================================
|
|
738
|
-
|
|
786
|
+
corr_dir = spatial_dir_dedup / "09_correlation_matrices"
|
|
739
787
|
|
|
740
|
-
if
|
|
741
|
-
logger.debug(f"{
|
|
788
|
+
if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
789
|
+
logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
742
790
|
else:
|
|
743
791
|
compute_positionwise_statistics(
|
|
744
792
|
adata,
|
|
@@ -763,7 +811,7 @@ def spatial_adata_core(
|
|
|
763
811
|
cmaps=cfg.correlation_matrix_cmaps,
|
|
764
812
|
vmin=None,
|
|
765
813
|
vmax=None,
|
|
766
|
-
output_dir=
|
|
814
|
+
output_dir=corr_dir,
|
|
767
815
|
output_key="positionwise_result",
|
|
768
816
|
)
|
|
769
817
|
|
smftools/cli_entry.py
CHANGED
|
@@ -8,6 +8,7 @@ import click
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
10
|
from .cli.hmm_adata import hmm_adata
|
|
11
|
+
from .cli.latent_adata import latent_adata
|
|
11
12
|
from .cli.load_adata import load_adata
|
|
12
13
|
from .cli.preprocess_adata import preprocess_adata
|
|
13
14
|
from .cli.spatial_adata import spatial_adata
|
|
@@ -103,6 +104,17 @@ def hmm(config_path):
|
|
|
103
104
|
##########################################
|
|
104
105
|
|
|
105
106
|
|
|
107
|
+
####### Latent ###########
|
|
108
|
+
@cli.command()
|
|
109
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
110
|
+
def latent(config_path):
|
|
111
|
+
"""Process data from CONFIG_PATH."""
|
|
112
|
+
latent_adata(config_path)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
##########################################
|
|
116
|
+
|
|
117
|
+
|
|
106
118
|
####### batch command ###########
|
|
107
119
|
@cli.command()
|
|
108
120
|
@click.argument(
|
smftools/config/conversion.yaml
CHANGED
|
@@ -15,6 +15,16 @@ autocorr_site_types:
|
|
|
15
15
|
|
|
16
16
|
# Spatial Analysis - Clustermap params
|
|
17
17
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
18
|
+
rolling_nn_layer: "nan0_0minus1"
|
|
19
|
+
rolling_nn_plot_layer: "nan0_0minus1"
|
|
20
|
+
rolling_nn_window: 30
|
|
21
|
+
rolling_nn_step: 2
|
|
22
|
+
rolling_nn_min_overlap: 20
|
|
23
|
+
rolling_nn_return_fraction: true
|
|
24
|
+
rolling_nn_obsm_key: "rolling_nn_dist"
|
|
25
|
+
rolling_nn_site_types:
|
|
26
|
+
- "GpC"
|
|
27
|
+
- "CpG"
|
|
18
28
|
clustermap_cmap_c: "coolwarm"
|
|
19
29
|
clustermap_cmap_gpc: "coolwarm"
|
|
20
30
|
clustermap_cmap_cpg: "viridis"
|
|
@@ -46,4 +56,4 @@ hmm_feature_sets:
|
|
|
46
56
|
cpg_patch: [0, inf]
|
|
47
57
|
|
|
48
58
|
hmm_merge_layer_features:
|
|
49
|
-
- ["all_accessible_features", 60]
|
|
59
|
+
- ["all_accessible_features", 60]
|
smftools/config/default.yaml
CHANGED
|
@@ -18,8 +18,9 @@ conversions:
|
|
|
18
18
|
fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
|
|
19
19
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
20
20
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
21
|
+
|
|
21
22
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
22
|
-
delete_intermediate_bams:
|
|
23
|
+
delete_intermediate_bams: False # Whether to delete intermediate BAM files.
|
|
23
24
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
24
25
|
|
|
25
26
|
# Sequencing modality and general experiment params
|
|
@@ -77,6 +78,7 @@ aligner_args:
|
|
|
77
78
|
# Sorted BAM and BED specific handling
|
|
78
79
|
make_bigwigs: False # Whether to make coverage bigwigs
|
|
79
80
|
make_beds: False # Whether to make beds from the aligned bams
|
|
81
|
+
annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
|
|
80
82
|
samtools_backend: auto # auto|python|cli for samtools-compatible operations
|
|
81
83
|
bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
|
|
82
84
|
bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
|
|
@@ -90,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
|
|
|
90
92
|
reference_column: 'Reference_strand'
|
|
91
93
|
sample_column: 'Experiment_name_and_barcode'
|
|
92
94
|
|
|
95
|
+
# Plotting params
|
|
96
|
+
clustermap_demux_types_to_plot:
|
|
97
|
+
- "single"
|
|
98
|
+
- "double"
|
|
99
|
+
- "already"
|
|
100
|
+
|
|
93
101
|
######## smftools preprocess params #########
|
|
94
102
|
# Read length, quality, and mapping filtering params
|
|
95
103
|
read_coord_filter:
|
|
@@ -140,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
|
|
|
140
148
|
- "CpG"
|
|
141
149
|
- "ambiguous_GpC_CpG"
|
|
142
150
|
duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
|
|
151
|
+
duplicate_detection_demux_types_to_use:
|
|
152
|
+
- "single"
|
|
153
|
+
- "double"
|
|
154
|
+
- "already"
|
|
143
155
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
144
156
|
- Fraction_C_site_modified
|
|
145
157
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
|
|
@@ -151,6 +163,11 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
|
|
|
151
163
|
|
|
152
164
|
# Position QC params
|
|
153
165
|
position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
|
|
166
|
+
mismatch_frequency_range:
|
|
167
|
+
- 0.01
|
|
168
|
+
- 0.99
|
|
169
|
+
mismatch_frequency_layer: "mismatch_integer_encoding"
|
|
170
|
+
mismatch_frequency_read_span_layer: "read_span_mask"
|
|
154
171
|
|
|
155
172
|
######## smftools spatial params #########
|
|
156
173
|
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
@@ -169,6 +186,9 @@ clustermap_cmap_gpc: "coolwarm"
|
|
|
169
186
|
clustermap_cmap_cpg: "coolwarm"
|
|
170
187
|
clustermap_cmap_a: "coolwarm"
|
|
171
188
|
spatial_clustermap_sortby: "gpc"
|
|
189
|
+
rolling_nn_site_types:
|
|
190
|
+
- "GpC"
|
|
191
|
+
- "CpG"
|
|
172
192
|
|
|
173
193
|
# Spatial Analysis - UMAP/Leiden params
|
|
174
194
|
layer_for_umap_plotting: 'nan_half'
|
|
@@ -243,6 +263,18 @@ hmm_feature_sets:
|
|
|
243
263
|
mid_accessible_patch: [20, 40]
|
|
244
264
|
large_accessible_patch: [40, 110]
|
|
245
265
|
nucleosome_depleted_region: [110, inf]
|
|
266
|
+
hmm_feature_colormaps:
|
|
267
|
+
small_accessible_patch: "#A5D6A7"
|
|
268
|
+
mid_accessible_patch: "#2E7D32"
|
|
269
|
+
large_accessible_patch: "#006400"
|
|
270
|
+
nucleosome_depleted_region: "#00441B"
|
|
271
|
+
all_accessible_features: "#2E7D32"
|
|
272
|
+
small_bound_stretch: "#1E88E5"
|
|
273
|
+
medium_bound_stretch: "#6A1B9A"
|
|
274
|
+
large_bound_stretch: "#FB8C00"
|
|
275
|
+
putative_nucleosome: "#6D4C41"
|
|
276
|
+
all_footprint_features: "#6A1B9A"
|
|
277
|
+
cpg_patch: "#6D4C41"
|
|
246
278
|
hmm_merge_layer_features:
|
|
247
279
|
- ["all_accessible_features", 60]
|
|
248
280
|
clustermap_cmap_hmm: "coolwarm"
|
|
@@ -259,6 +291,11 @@ hmm_clustermap_feature_layers:
|
|
|
259
291
|
- medium_bound_stretch
|
|
260
292
|
- putative_nucleosome
|
|
261
293
|
- large_bound_stretch
|
|
294
|
+
- all_footprint_features
|
|
295
|
+
hmm_clustermap_length_layers:
|
|
296
|
+
- all_accessible_features
|
|
297
|
+
- all_accessible_features_merged
|
|
298
|
+
- all_footprint_features
|
|
262
299
|
hmm_clustermap_sortby: "hmm"
|
|
263
300
|
hmm_peak_feature_configs:
|
|
264
301
|
all_accessible_features:
|