smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +18 -2
- smftools/cli/hmm_adata.py +18 -1
- smftools/cli/latent_adata.py +522 -67
- smftools/cli/load_adata.py +2 -2
- smftools/cli/preprocess_adata.py +32 -93
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +23 -109
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +41 -5
- smftools/config/conversion.yaml +0 -10
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +49 -13
- smftools/config/experiment_config.py +96 -3
- smftools/constants.py +4 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +53 -13
- smftools/informatics/h5ad_functions.py +83 -0
- smftools/informatics/modkit_extract_to_adata.py +4 -0
- smftools/plotting/__init__.py +26 -12
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +58 -3362
- smftools/plotting/hmm_plotting.py +1586 -2
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +3 -0
- smftools/preprocessing/append_base_context.py +1 -1
- smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +109 -85
- smftools/tools/__init__.py +6 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_nmf.py +18 -7
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +70 -154
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +640 -3
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +52 -4
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/cli/latent_adata.py
CHANGED
|
@@ -2,16 +2,118 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Optional, Tuple
|
|
5
|
+
from typing import Optional, Sequence, Tuple
|
|
6
6
|
|
|
7
7
|
import anndata as ad
|
|
8
8
|
|
|
9
|
-
from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
|
|
9
|
+
from smftools.constants import LATENT_DIR, LOGGING_DIR, REFERENCE_STRAND, SEQUENCE_INTEGER_ENCODING
|
|
10
10
|
from smftools.logging_utils import get_logger, setup_logging
|
|
11
11
|
|
|
12
12
|
logger = get_logger(__name__)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def _build_mod_sites_var_filter_mask(
    adata: ad.AnnData,
    references: Sequence[str],
    cfg,
    smf_modality: str,
    deaminase: bool,
) -> "np.ndarray":
    """Return a boolean var mask that is True at mod sites shared by all references.

    For every reference, the relevant ``{ref}_{base}_site`` columns of
    ``adata.var`` are OR-ed together and then AND-ed with the
    ``position_in_{ref}`` column. The per-reference results are finally AND-ed
    across references.

    Args:
        adata: Object exposing ``var`` (column lookup) and ``n_vars``.
        references: Reference names used to build the ``adata.var`` column names.
        cfg: Configuration object; only ``cfg.mod_target_bases`` is read.
        smf_modality: Modality label; only compared against ``"direct"``.
        deaminase: When truthy and the modality is not ``"direct"``, only the
            ``{ref}_C_site`` column is used for each reference.

    Returns:
        Boolean array; all True with length ``adata.n_vars`` when ``references``
        is empty.

    Raises:
        KeyError: If any required column is absent from ``adata.var``.
    """
    import numpy as np

    expanded_bases = _expand_mod_target_bases(cfg.mod_target_bases)
    # The choice of site columns is the same for every reference.
    site_labels = ["C"] if deaminase and smf_modality != "direct" else expanded_bases

    per_reference = []
    for reference in references:
        site_columns = [f"{reference}_{label}_site" for label in site_labels]
        valid_column = f"position_in_{reference}"

        absent = [
            name for name in (*site_columns, valid_column) if name not in adata.var.columns
        ]
        if absent:
            raise KeyError(f"var_filters not found in adata.var: {absent}")

        site_flags = [np.asarray(adata.var[name].values, dtype=bool) for name in site_columns]
        if len(site_flags) == 1:
            is_mod_site = site_flags[0]
        else:
            is_mod_site = np.logical_or.reduce(site_flags)

        # presumably position_in_{ref} flags positions present in that reference - confirm
        is_valid_position = np.asarray(adata.var[valid_column].values, dtype=bool)
        per_reference.append(np.logical_and(is_mod_site, is_valid_position))

    if per_reference:
        return np.logical_and.reduce(per_reference)
    return np.ones(adata.n_vars, dtype=bool)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _build_shared_valid_non_mod_sites_mask(
    adata: ad.AnnData,
    references: Sequence[str],
    cfg,
    smf_modality: str,
    deaminase: bool,
) -> "np.ndarray":
    """Build a boolean var mask for shared valid positions that are not mod sites.

    The mask is True where the shared position mask (from
    ``_build_reference_position_mask``) is True and none of the per-reference
    ``{ref}_{base}_site`` columns is True.

    Args:
        adata: Object exposing ``var`` (column lookup) and ``n_vars``.
        references: Reference names used to build the ``adata.var`` column names.
        cfg: Configuration object; only ``cfg.mod_target_bases`` is read.
        smf_modality: Modality label; only compared against ``"direct"``.
        deaminase: When truthy and the modality is not ``"direct"``, only the
            ``{ref}_C_site`` column is treated as a mod-site column.

    Returns:
        Boolean array. With no references the shared position mask is returned
        unchanged.

    Raises:
        KeyError: If a required ``position_in_{ref}`` or mod-site column is
            absent from ``adata.var``.
    """
    import numpy as np

    shared_position_mask = _build_reference_position_mask(adata, references)
    # len() rather than truthiness: the caller passes a pandas Index of categories.
    if len(references) == 0:
        return shared_position_mask

    mod_target_bases = _expand_mod_target_bases(cfg.mod_target_bases)
    # Which site columns to use does not depend on the reference, so decide once.
    if deaminase and smf_modality != "direct":
        site_labels = ["C"]
    else:
        site_labels = mod_target_bases

    ref_mod_masks = []
    for ref in references:
        mod_site_cols = [f"{ref}_{label}_site" for label in site_labels]
        missing = [col for col in mod_site_cols if col not in adata.var.columns]
        if missing:
            raise KeyError(f"var_filters not found in adata.var: {missing}")

        mod_masks = [np.asarray(adata.var[col].values, dtype=bool) for col in mod_site_cols]
        ref_mod_masks.append(
            mod_masks[0] if len(mod_masks) == 1 else np.logical_or.reduce(mod_masks)
        )

    # references is non-empty here, so ref_mod_masks always has at least one entry.
    any_mod_mask = np.logical_or.reduce(ref_mod_masks)
    return np.logical_and(shared_position_mask, np.logical_not(any_mod_mask))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _expand_mod_target_bases(mod_target_bases: Sequence[str]) -> list[str]:
|
|
89
|
+
"""Ensure ambiguous GpC/CpG sites are included when requested."""
|
|
90
|
+
bases = list(mod_target_bases)
|
|
91
|
+
if any(base in {"GpC", "CpG"} for base in bases) and "ambiguous_GpC_CpG" not in bases:
|
|
92
|
+
bases.append("ambiguous_GpC_CpG")
|
|
93
|
+
return bases
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _build_reference_position_mask(
|
|
97
|
+
adata: ad.AnnData,
|
|
98
|
+
references: Sequence[str],
|
|
99
|
+
) -> "np.ndarray":
|
|
100
|
+
"""Build a boolean var mask for positions valid across references."""
|
|
101
|
+
import numpy as np
|
|
102
|
+
|
|
103
|
+
ref_masks = []
|
|
104
|
+
for ref in references:
|
|
105
|
+
position_col = f"position_in_{ref}"
|
|
106
|
+
if position_col not in adata.var.columns:
|
|
107
|
+
raise KeyError(f"var_filters not found in adata.var: {position_col}")
|
|
108
|
+
position_mask = np.asarray(adata.var[position_col].values, dtype=bool)
|
|
109
|
+
ref_masks.append(position_mask)
|
|
110
|
+
|
|
111
|
+
if not ref_masks:
|
|
112
|
+
return np.ones(adata.n_vars, dtype=bool)
|
|
113
|
+
|
|
114
|
+
return np.logical_and.reduce(ref_masks)
|
|
115
|
+
|
|
116
|
+
|
|
15
117
|
def latent_adata(
|
|
16
118
|
config_path: str,
|
|
17
119
|
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
|
|
@@ -43,6 +145,8 @@ def latent_adata(
|
|
|
43
145
|
pp_path = paths.pp
|
|
44
146
|
pp_dedup_path = paths.pp_dedup
|
|
45
147
|
spatial_path = paths.spatial
|
|
148
|
+
chimeric_path = paths.chimeric
|
|
149
|
+
variant_path = paths.variant
|
|
46
150
|
hmm_path = paths.hmm
|
|
47
151
|
latent_path = paths.latent
|
|
48
152
|
|
|
@@ -59,15 +163,21 @@ def latent_adata(
|
|
|
59
163
|
return adata
|
|
60
164
|
|
|
61
165
|
# 3) Decide which AnnData to use as the *starting point* for latent analyses
|
|
62
|
-
if
|
|
63
|
-
start_adata = _load(latent_path)
|
|
64
|
-
source_path = latent_path
|
|
65
|
-
elif hmm_path.exists():
|
|
166
|
+
if hmm_path.exists():
|
|
66
167
|
start_adata = _load(hmm_path)
|
|
67
168
|
source_path = hmm_path
|
|
169
|
+
elif latent_path.exists():
|
|
170
|
+
start_adata = _load(latent_path)
|
|
171
|
+
source_path = latent_path
|
|
68
172
|
elif spatial_path.exists():
|
|
69
173
|
start_adata = _load(spatial_path)
|
|
70
174
|
source_path = spatial_path
|
|
175
|
+
elif chimeric_path.exists():
|
|
176
|
+
start_adata = _load(chimeric_path)
|
|
177
|
+
source_path = chimeric_path
|
|
178
|
+
elif variant_path.exists():
|
|
179
|
+
start_adata = _load(variant_path)
|
|
180
|
+
source_path = variant_path
|
|
71
181
|
elif pp_dedup_path.exists():
|
|
72
182
|
start_adata = _load(pp_dedup_path)
|
|
73
183
|
source_path = pp_dedup_path
|
|
@@ -109,7 +219,7 @@ def latent_adata_core(
|
|
|
109
219
|
Does:
|
|
110
220
|
- Optional sample sheet load.
|
|
111
221
|
- Optional inversion & reindexing.
|
|
112
|
-
- PCA/UMAP/Leiden
|
|
222
|
+
- PCA/KNN/UMAP/Leiden/NMF/PARAFAC
|
|
113
223
|
- Save latent AnnData to `latent_adata_path`.
|
|
114
224
|
|
|
115
225
|
Returns
|
|
@@ -130,20 +240,24 @@ def latent_adata_core(
|
|
|
130
240
|
from ..metadata import record_smftools_metadata
|
|
131
241
|
from ..plotting import (
|
|
132
242
|
plot_cp_sequence_components,
|
|
133
|
-
|
|
243
|
+
plot_embedding_grid,
|
|
134
244
|
plot_nmf_components,
|
|
135
|
-
|
|
136
|
-
|
|
245
|
+
plot_pca_components,
|
|
246
|
+
plot_pca_explained_variance,
|
|
247
|
+
plot_pca_grid,
|
|
248
|
+
plot_umap_grid,
|
|
137
249
|
)
|
|
138
250
|
from ..preprocessing import (
|
|
139
251
|
invert_adata,
|
|
140
252
|
load_sample_sheet,
|
|
141
253
|
reindex_references_adata,
|
|
142
254
|
)
|
|
143
|
-
from ..readwrite import make_dirs
|
|
255
|
+
from ..readwrite import make_dirs
|
|
144
256
|
from ..tools import (
|
|
257
|
+
calculate_knn,
|
|
145
258
|
calculate_leiden,
|
|
146
259
|
calculate_nmf,
|
|
260
|
+
calculate_pca,
|
|
147
261
|
calculate_sequence_cp_decomposition,
|
|
148
262
|
calculate_umap,
|
|
149
263
|
)
|
|
@@ -214,97 +328,438 @@ def latent_adata_core(
|
|
|
214
328
|
|
|
215
329
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
216
330
|
|
|
331
|
+
latent_dir_dedup = latent_directory / "deduplicated"
|
|
332
|
+
|
|
217
333
|
# ============================================================
|
|
218
|
-
# 2) PCA/UMAP
|
|
334
|
+
# 2) PCA/UMAP/NMF at valid modified base site binary encodings shared across references
|
|
219
335
|
# ============================================================
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
336
|
+
SUBSET = "shared_valid_mod_sites_binary_mod_arrays"
|
|
337
|
+
|
|
338
|
+
pca_dir = latent_dir_dedup / f"01_pca_{SUBSET}"
|
|
339
|
+
umap_dir = latent_dir_dedup / f"01_umap_{SUBSET}"
|
|
340
|
+
nmf_dir = latent_dir_dedup / f"01_nmf_{SUBSET}"
|
|
341
|
+
|
|
342
|
+
mod_site_layers = []
|
|
343
|
+
for mod_base in cfg.mod_target_bases:
|
|
344
|
+
mod_site_layers += [f"Modified_{mod_base}_site_count", f"Fraction_{mod_base}_site_modified"]
|
|
345
|
+
|
|
346
|
+
plotting_layers = [cfg.sample_name_col_for_plotting, REFERENCE_STRAND] + mod_site_layers
|
|
347
|
+
plotting_layers += cfg.umap_layers_to_plot
|
|
348
|
+
|
|
349
|
+
mod_sites_mask = _build_mod_sites_var_filter_mask(
|
|
350
|
+
adata=adata,
|
|
351
|
+
references=references,
|
|
352
|
+
cfg=cfg,
|
|
353
|
+
smf_modality=smf_modality,
|
|
354
|
+
deaminase=deaminase,
|
|
355
|
+
)
|
|
356
|
+
non_mod_sites_mask = _build_shared_valid_non_mod_sites_mask(
|
|
357
|
+
adata=adata,
|
|
358
|
+
references=references,
|
|
359
|
+
cfg=cfg,
|
|
360
|
+
smf_modality=smf_modality,
|
|
361
|
+
deaminase=deaminase,
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
# PCA calculation
|
|
365
|
+
adata = calculate_pca(
|
|
366
|
+
adata,
|
|
367
|
+
layer=cfg.layer_for_umap_plotting,
|
|
368
|
+
var_mask=mod_sites_mask,
|
|
369
|
+
n_pcs=10,
|
|
370
|
+
output_suffix=SUBSET,
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
# KNN calculation
|
|
374
|
+
adata = calculate_knn(
|
|
375
|
+
adata,
|
|
376
|
+
obsm=f"X_pca_{SUBSET}",
|
|
377
|
+
knn_neighbors=15,
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
# UMAP Calculation
|
|
381
|
+
adata = calculate_umap(
|
|
382
|
+
adata,
|
|
383
|
+
obsm=f"X_pca_{SUBSET}",
|
|
384
|
+
output_suffix=SUBSET,
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
# Leiden clustering
|
|
388
|
+
calculate_leiden(adata, resolution=0.1, connectivities_key=f"connectivities_X_pca_{SUBSET}")
|
|
389
|
+
|
|
390
|
+
# NMF Calculation
|
|
391
|
+
adata = calculate_nmf(
|
|
392
|
+
adata,
|
|
393
|
+
layer=cfg.layer_for_umap_plotting,
|
|
394
|
+
var_mask=mod_sites_mask,
|
|
395
|
+
n_components=2,
|
|
396
|
+
suffix=SUBSET,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
# PCA
|
|
400
|
+
if pca_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
401
|
+
logger.debug(f"{pca_dir} already exists. Skipping PCA calculation and plotting.")
|
|
233
402
|
else:
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
403
|
+
make_dirs([pca_dir])
|
|
404
|
+
plot_pca_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=pca_dir)
|
|
405
|
+
plot_pca_explained_variance(adata, subset=SUBSET, output_dir=pca_dir)
|
|
406
|
+
plot_pca_components(adata, output_dir=pca_dir, suffix=SUBSET)
|
|
237
407
|
|
|
238
|
-
# UMAP
|
|
408
|
+
# UMAP
|
|
239
409
|
if umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
240
410
|
logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
|
|
241
411
|
else:
|
|
242
412
|
make_dirs([umap_dir])
|
|
413
|
+
plot_umap_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=umap_dir)
|
|
243
414
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
knn_neighbors=15,
|
|
250
|
-
)
|
|
415
|
+
# NMF
|
|
416
|
+
if nmf_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
417
|
+
logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
|
|
418
|
+
else:
|
|
419
|
+
make_dirs([nmf_dir])
|
|
251
420
|
|
|
252
|
-
|
|
421
|
+
plot_embedding_grid(adata, basis=f"nmf_{SUBSET}", color=plotting_layers, output_dir=nmf_dir)
|
|
422
|
+
plot_nmf_components(adata, output_dir=nmf_dir, suffix=SUBSET)
|
|
253
423
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
424
|
+
# ============================================================
|
|
425
|
+
# 3) PCA/UMAP/NMF at valid base site integer encodings shared across references
|
|
426
|
+
# ============================================================
|
|
427
|
+
SUBSET = "shared_valid_ref_sites_integer_sequence_encodings"
|
|
428
|
+
|
|
429
|
+
pca_dir = latent_dir_dedup / f"02_pca_{SUBSET}"
|
|
430
|
+
umap_dir = latent_dir_dedup / f"02_umap_{SUBSET}"
|
|
431
|
+
nmf_dir = latent_dir_dedup / f"02_nmf_{SUBSET}"
|
|
432
|
+
|
|
433
|
+
valid_sites = _build_reference_position_mask(adata, references)
|
|
434
|
+
|
|
435
|
+
# PCA calculation
|
|
436
|
+
adata = calculate_pca(
|
|
437
|
+
adata,
|
|
438
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
439
|
+
var_mask=valid_sites,
|
|
440
|
+
n_pcs=10,
|
|
441
|
+
output_suffix=SUBSET,
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
# KNN calculation
|
|
445
|
+
adata = calculate_knn(
|
|
446
|
+
adata,
|
|
447
|
+
obsm=f"X_pca_{SUBSET}",
|
|
448
|
+
knn_neighbors=15,
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
# UMAP Calculation
|
|
452
|
+
adata = calculate_umap(
|
|
453
|
+
adata,
|
|
454
|
+
obsm=f"X_pca_{SUBSET}",
|
|
455
|
+
output_suffix=SUBSET,
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
# Leiden clustering
|
|
459
|
+
calculate_leiden(adata, resolution=0.1, connectivities_key=f"connectivities_X_pca_{SUBSET}")
|
|
460
|
+
|
|
461
|
+
# NMF Calculation
|
|
462
|
+
adata = calculate_nmf(
|
|
463
|
+
adata,
|
|
464
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
465
|
+
var_mask=valid_sites,
|
|
466
|
+
n_components=2,
|
|
467
|
+
suffix=SUBSET,
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
# PCA
|
|
471
|
+
if pca_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
472
|
+
logger.debug(f"{pca_dir} already exists. Skipping PCA calculation and plotting.")
|
|
473
|
+
else:
|
|
474
|
+
make_dirs([pca_dir])
|
|
475
|
+
plot_pca_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=pca_dir)
|
|
476
|
+
plot_pca_explained_variance(adata, subset=SUBSET, output_dir=pca_dir)
|
|
477
|
+
plot_pca_components(adata, output_dir=pca_dir, suffix=SUBSET)
|
|
478
|
+
|
|
479
|
+
# UMAP
|
|
480
|
+
if umap_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
481
|
+
logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
|
|
482
|
+
else:
|
|
483
|
+
make_dirs([umap_dir])
|
|
484
|
+
plot_umap_grid(adata, subset=SUBSET, color=plotting_layers, output_dir=umap_dir)
|
|
258
485
|
|
|
259
486
|
# NMF
|
|
260
|
-
if nmf_dir.is_dir() and not getattr(cfg, "
|
|
487
|
+
if nmf_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
261
488
|
logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
|
|
262
489
|
else:
|
|
263
490
|
make_dirs([nmf_dir])
|
|
264
|
-
|
|
491
|
+
|
|
492
|
+
plot_embedding_grid(adata, basis=f"nmf_{SUBSET}", color=plotting_layers, output_dir=nmf_dir)
|
|
493
|
+
plot_nmf_components(adata, output_dir=nmf_dir, suffix=SUBSET)
|
|
494
|
+
|
|
495
|
+
# ============================================================
|
|
496
|
+
# 3) CP PARAFAC factorization of shared mod site OHE sequences with mask layer
|
|
497
|
+
# ============================================================
|
|
498
|
+
SUBSET = "shared_valid_mod_sites_ohe_sequence_N_masked"
|
|
499
|
+
|
|
500
|
+
cp_sequence_dir = latent_dir_dedup / f"03_cp_{SUBSET}"
|
|
501
|
+
|
|
502
|
+
# Calculate CP tensor factorization
|
|
503
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
504
|
+
logger.warning(
|
|
505
|
+
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
506
|
+
SEQUENCE_INTEGER_ENCODING,
|
|
507
|
+
)
|
|
508
|
+
else:
|
|
509
|
+
adata = calculate_sequence_cp_decomposition(
|
|
510
|
+
adata,
|
|
511
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
512
|
+
var_mask=mod_sites_mask,
|
|
513
|
+
var_mask_name="shared_reference_and_mod_site_positions",
|
|
514
|
+
rank=2,
|
|
515
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
516
|
+
components_key=f"H_cp_{SUBSET}",
|
|
517
|
+
uns_key=f"cp_{SUBSET}",
|
|
518
|
+
non_negative=False,
|
|
519
|
+
)
|
|
520
|
+
|
|
521
|
+
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
522
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
523
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
524
|
+
else:
|
|
525
|
+
make_dirs([cp_sequence_dir])
|
|
526
|
+
plot_embedding_grid(
|
|
527
|
+
adata,
|
|
528
|
+
basis=f"cp_{SUBSET}",
|
|
529
|
+
color=plotting_layers,
|
|
530
|
+
output_dir=cp_sequence_dir,
|
|
531
|
+
)
|
|
532
|
+
plot_cp_sequence_components(
|
|
533
|
+
adata,
|
|
534
|
+
output_dir=cp_sequence_dir,
|
|
535
|
+
components_key=f"H_cp_{SUBSET}",
|
|
536
|
+
uns_key=f"cp_{SUBSET}",
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
# ============================================================
|
|
540
|
+
# 4) Non-negative CP PARAFAC factorization of shared mod site OHE sequences with mask layer
|
|
541
|
+
# ============================================================
|
|
542
|
+
SUBSET = "shared_valid_mod_sites_ohe_sequence_N_masked_non_negative"
|
|
543
|
+
|
|
544
|
+
cp_sequence_dir = latent_dir_dedup / f"04_cp_{SUBSET}"
|
|
545
|
+
|
|
546
|
+
# Calculate CP tensor factorization
|
|
547
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
548
|
+
logger.warning(
|
|
549
|
+
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
550
|
+
SEQUENCE_INTEGER_ENCODING,
|
|
551
|
+
)
|
|
552
|
+
else:
|
|
553
|
+
adata = calculate_sequence_cp_decomposition(
|
|
554
|
+
adata,
|
|
555
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
556
|
+
var_mask=mod_sites_mask,
|
|
557
|
+
var_mask_name="shared_reference_mod_site_positions",
|
|
558
|
+
rank=2,
|
|
559
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
560
|
+
components_key=f"H_cp_{SUBSET}",
|
|
561
|
+
uns_key=f"cp_{SUBSET}",
|
|
562
|
+
non_negative=True,
|
|
563
|
+
)
|
|
564
|
+
|
|
565
|
+
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
566
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
567
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
568
|
+
else:
|
|
569
|
+
make_dirs([cp_sequence_dir])
|
|
570
|
+
plot_embedding_grid(
|
|
571
|
+
adata,
|
|
572
|
+
basis=f"cp_{SUBSET}",
|
|
573
|
+
color=plotting_layers,
|
|
574
|
+
output_dir=cp_sequence_dir,
|
|
575
|
+
)
|
|
576
|
+
plot_cp_sequence_components(
|
|
577
|
+
adata,
|
|
578
|
+
output_dir=cp_sequence_dir,
|
|
579
|
+
components_key=f"H_cp_{SUBSET}",
|
|
580
|
+
uns_key=f"cp_{SUBSET}",
|
|
581
|
+
)
|
|
582
|
+
# ============================================================
|
|
583
|
+
# 6) CP PARAFAC factorization of non mod-site OHE sequences with mask layer
|
|
584
|
+
# ============================================================
|
|
585
|
+
SUBSET = "non_mod_site_ohe_sequence_N_masked"
|
|
586
|
+
|
|
587
|
+
cp_sequence_dir = latent_dir_dedup / f"05_cp_{SUBSET}"
|
|
588
|
+
|
|
589
|
+
# Calculate CP tensor factorization
|
|
590
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
591
|
+
logger.warning(
|
|
592
|
+
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
593
|
+
SEQUENCE_INTEGER_ENCODING,
|
|
594
|
+
)
|
|
595
|
+
else:
|
|
596
|
+
adata = calculate_sequence_cp_decomposition(
|
|
597
|
+
adata,
|
|
598
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
599
|
+
var_mask=non_mod_sites_mask,
|
|
600
|
+
var_mask_name="non_mod_site_reference_positions",
|
|
601
|
+
rank=2,
|
|
602
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
603
|
+
components_key=f"H_cp_{SUBSET}",
|
|
604
|
+
uns_key=f"cp_{SUBSET}",
|
|
605
|
+
non_negative=False,
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
609
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
610
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
611
|
+
else:
|
|
612
|
+
make_dirs([cp_sequence_dir])
|
|
613
|
+
plot_embedding_grid(
|
|
614
|
+
adata,
|
|
615
|
+
basis=f"cp_{SUBSET}",
|
|
616
|
+
color=plotting_layers,
|
|
617
|
+
output_dir=cp_sequence_dir,
|
|
618
|
+
)
|
|
619
|
+
plot_cp_sequence_components(
|
|
620
|
+
adata,
|
|
621
|
+
output_dir=cp_sequence_dir,
|
|
622
|
+
components_key=f"H_cp_{SUBSET}",
|
|
623
|
+
uns_key=f"cp_{SUBSET}",
|
|
624
|
+
)
|
|
625
|
+
|
|
626
|
+
# ============================================================
|
|
627
|
+
# 7) Non-negative CP PARAFAC factorization of non mod-site OHE sequences with mask layer
|
|
628
|
+
# ============================================================
|
|
629
|
+
SUBSET = "non_mod_site_ohe_sequence_N_masked_non_negative"
|
|
630
|
+
|
|
631
|
+
cp_sequence_dir = latent_dir_dedup / f"06_cp_{SUBSET}"
|
|
632
|
+
|
|
633
|
+
# Calculate CP tensor factorization
|
|
634
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
635
|
+
logger.warning(
|
|
636
|
+
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
637
|
+
SEQUENCE_INTEGER_ENCODING,
|
|
638
|
+
)
|
|
639
|
+
else:
|
|
640
|
+
adata = calculate_sequence_cp_decomposition(
|
|
265
641
|
adata,
|
|
266
|
-
layer=
|
|
267
|
-
|
|
268
|
-
|
|
642
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
643
|
+
var_mask=non_mod_sites_mask,
|
|
644
|
+
var_mask_name="non_mod_site_reference_positions",
|
|
645
|
+
rank=2,
|
|
646
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
647
|
+
components_key=f"H_cp_{SUBSET}",
|
|
648
|
+
uns_key=f"cp_{SUBSET}",
|
|
649
|
+
non_negative=True,
|
|
269
650
|
)
|
|
270
|
-
nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
|
|
271
|
-
nmf_layers += cfg.umap_layers_to_plot
|
|
272
|
-
plot_embedding(adata, basis="nmf", color=nmf_layers, output_dir=nmf_dir)
|
|
273
|
-
plot_nmf_components(adata, output_dir=nmf_dir)
|
|
274
651
|
|
|
275
652
|
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
276
|
-
if
|
|
277
|
-
logger.debug(f"{
|
|
278
|
-
|
|
653
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
654
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
655
|
+
else:
|
|
656
|
+
make_dirs([cp_sequence_dir])
|
|
657
|
+
plot_embedding_grid(
|
|
658
|
+
adata,
|
|
659
|
+
basis=f"cp_{SUBSET}",
|
|
660
|
+
color=plotting_layers,
|
|
661
|
+
output_dir=cp_sequence_dir,
|
|
662
|
+
)
|
|
663
|
+
plot_cp_sequence_components(
|
|
664
|
+
adata,
|
|
665
|
+
output_dir=cp_sequence_dir,
|
|
666
|
+
components_key=f"H_cp_{SUBSET}",
|
|
667
|
+
uns_key=f"cp_{SUBSET}",
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
# ============================================================
|
|
671
|
+
# 8) CP PARAFAC factorization of full OHE sequences with mask layer
|
|
672
|
+
# ============================================================
|
|
673
|
+
SUBSET = "full_ohe_sequence_N_masked"
|
|
674
|
+
|
|
675
|
+
cp_sequence_dir = latent_dir_dedup / f"07_cp_{SUBSET}"
|
|
676
|
+
|
|
677
|
+
# Calculate CP tensor factorization
|
|
678
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
279
679
|
logger.warning(
|
|
280
680
|
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
281
681
|
SEQUENCE_INTEGER_ENCODING,
|
|
282
682
|
)
|
|
283
683
|
else:
|
|
284
|
-
make_dirs([nmf_sequence_dir])
|
|
285
684
|
adata = calculate_sequence_cp_decomposition(
|
|
286
685
|
adata,
|
|
287
686
|
layer=SEQUENCE_INTEGER_ENCODING,
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
687
|
+
var_mask=_build_reference_position_mask(adata, references),
|
|
688
|
+
var_mask_name="shared_reference_positions",
|
|
689
|
+
rank=2,
|
|
690
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
691
|
+
components_key=f"H_cp_{SUBSET}",
|
|
692
|
+
uns_key=f"cp_{SUBSET}",
|
|
693
|
+
non_negative=False,
|
|
694
|
+
)
|
|
695
|
+
|
|
696
|
+
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
697
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
698
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
699
|
+
else:
|
|
700
|
+
make_dirs([cp_sequence_dir])
|
|
701
|
+
plot_embedding_grid(
|
|
702
|
+
adata,
|
|
703
|
+
basis=f"cp_{SUBSET}",
|
|
704
|
+
color=plotting_layers,
|
|
705
|
+
output_dir=cp_sequence_dir,
|
|
706
|
+
)
|
|
707
|
+
plot_cp_sequence_components(
|
|
708
|
+
adata,
|
|
709
|
+
output_dir=cp_sequence_dir,
|
|
710
|
+
components_key=f"H_cp_{SUBSET}",
|
|
711
|
+
uns_key=f"cp_{SUBSET}",
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
# ============================================================
|
|
715
|
+
# 9) Non-negative CP PARAFAC factorization of full OHE sequences with mask layer
|
|
716
|
+
# ============================================================
|
|
717
|
+
SUBSET = "full_ohe_sequence_N_masked_non_negative"
|
|
718
|
+
|
|
719
|
+
cp_sequence_dir = latent_dir_dedup / f"08_cp_{SUBSET}"
|
|
720
|
+
|
|
721
|
+
# Calculate CP tensor factorization
|
|
722
|
+
if SEQUENCE_INTEGER_ENCODING not in adata.layers:
|
|
723
|
+
logger.warning(
|
|
724
|
+
"Layer %s not found; skipping sequence integer encoding CP.",
|
|
725
|
+
SEQUENCE_INTEGER_ENCODING,
|
|
726
|
+
)
|
|
727
|
+
else:
|
|
728
|
+
adata = calculate_sequence_cp_decomposition(
|
|
729
|
+
adata,
|
|
730
|
+
layer=SEQUENCE_INTEGER_ENCODING,
|
|
731
|
+
var_mask=_build_reference_position_mask(adata, references),
|
|
732
|
+
var_mask_name="shared_reference_positions",
|
|
733
|
+
rank=2,
|
|
734
|
+
embedding_key=f"X_cp_{SUBSET}",
|
|
735
|
+
components_key=f"H_cp_{SUBSET}",
|
|
736
|
+
uns_key=f"cp_{SUBSET}",
|
|
737
|
+
non_negative=True,
|
|
738
|
+
)
|
|
739
|
+
|
|
740
|
+
# Plot the CP embedding and components (skipped when the output directory exists and no redo is forced)
|
|
741
|
+
if cp_sequence_dir.is_dir() and not getattr(cfg, "force_redo_latent_analyses", False):
|
|
742
|
+
logger.debug(f"{cp_sequence_dir} already exists. Skipping sequence CP plotting.")
|
|
743
|
+
else:
|
|
744
|
+
make_dirs([cp_sequence_dir])
|
|
745
|
+
plot_embedding_grid(
|
|
746
|
+
adata,
|
|
747
|
+
basis=f"cp_{SUBSET}",
|
|
748
|
+
color=plotting_layers,
|
|
749
|
+
output_dir=cp_sequence_dir,
|
|
292
750
|
)
|
|
293
|
-
nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
|
|
294
|
-
nmf_layers += cfg.umap_layers_to_plot
|
|
295
|
-
plot_embedding(adata, basis="cp_sequence", color=nmf_layers, output_dir=nmf_sequence_dir)
|
|
296
751
|
plot_cp_sequence_components(
|
|
297
752
|
adata,
|
|
298
|
-
output_dir=
|
|
299
|
-
components_key="
|
|
300
|
-
uns_key="
|
|
753
|
+
output_dir=cp_sequence_dir,
|
|
754
|
+
components_key=f"H_cp_{SUBSET}",
|
|
755
|
+
uns_key=f"cp_{SUBSET}",
|
|
301
756
|
)
|
|
302
757
|
|
|
303
758
|
# ============================================================
|
|
304
|
-
#
|
|
759
|
+
# 10) Save latent AnnData
|
|
305
760
|
# ============================================================
|
|
306
|
-
if
|
|
307
|
-
logger.info("Saving latent analyzed AnnData
|
|
761
|
+
if not latent_adata_path.exists():
|
|
762
|
+
logger.info("Saving latent analyzed AnnData")
|
|
308
763
|
record_smftools_metadata(
|
|
309
764
|
adata,
|
|
310
765
|
step_name="latent",
|