smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/_version.py
CHANGED
smftools/cli/helpers.py
CHANGED
@@ -5,6 +5,8 @@ from pathlib import Path
 
 import anndata as ad
 
+from smftools.constants import H5_DIR, HMM_DIR, LATENT_DIR, LOAD_DIR, PREPROCESS_DIR, SPATIAL_DIR
+
 from ..metadata import write_runtime_schema_yaml
 from ..readwrite import safe_write_h5ad
 
@@ -16,28 +18,35 @@ class AdataPaths:
     pp_dedup: Path
     spatial: Path
     hmm: Path
+    latent: Path
 
 
 def get_adata_paths(cfg) -> AdataPaths:
     """
     Central helper: given cfg, compute all standard AnnData paths.
     """
-
+    output_directory = Path(cfg.output_directory)
 
-    raw =
+    raw = output_directory / LOAD_DIR / H5_DIR / f"{cfg.experiment_name}.h5ad.gz"
 
-    pp =
+    pp = output_directory / PREPROCESS_DIR / H5_DIR / f"{cfg.experiment_name}_preprocessed.h5ad.gz"
 
     if cfg.smf_modality == "direct":
         # direct SMF: duplicate-removed path is just preprocessed path
         pp_dedup = pp
     else:
-        pp_dedup =
+        pp_dedup = (
+            output_directory
+            / PREPROCESS_DIR
+            / H5_DIR
+            / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
+        )
 
     pp_dedup_base = pp_dedup.name.removesuffix(".h5ad.gz")
 
-    spatial =
-    hmm =
+    spatial = output_directory / SPATIAL_DIR / H5_DIR / f"{pp_dedup_base}_spatial.h5ad.gz"
+    hmm = output_directory / HMM_DIR / H5_DIR / f"{pp_dedup_base}_hmm.h5ad.gz"
+    latent = output_directory / LATENT_DIR / H5_DIR / f"{pp_dedup_base}_latent.h5ad.gz"
 
     return AdataPaths(
         raw=raw,
@@ -45,7 +54,24 @@ def get_adata_paths(cfg) -> AdataPaths:
         pp_dedup=pp_dedup,
         spatial=spatial,
         hmm=hmm,
+        latent=latent,
+    )
+
+
+def load_experiment_config(config_path: str):
+    """Load ExperimentConfig without invoking any pipeline stages."""
+    from datetime import datetime
+    from importlib import resources
+
+    from ..config import ExperimentConfig, LoadExperimentConfig
+
+    date_str = datetime.today().strftime("%y%m%d")
+    loader = LoadExperimentConfig(config_path)
+    defaults_dir = resources.files("smftools").joinpath("config")
+    cfg, _ = ExperimentConfig.from_var_dict(
+        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
     )
+    return cfg
 
 
 def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
smftools/cli/hmm_adata.py
CHANGED
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import copy
+import logging
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 
-from smftools.
+from smftools.constants import HMM_DIR, LOGGING_DIR
+from smftools.logging_utils import get_logger, setup_logging
 from smftools.optional_imports import require
 
 # FIX: import _to_dense_np to avoid NameError
@@ -19,12 +21,132 @@ if TYPE_CHECKING:
     import torch as torch_types
 
 torch = require("torch", extra="torch", purpose="HMM CLI")
+mpl = require("matplotlib", extra="plotting", purpose="HMM plotting")
+mpl_colors = require("matplotlib.colors", extra="plotting", purpose="HMM plotting")
 
 # =============================================================================
 # Helpers: extracting training arrays
 # =============================================================================
 
 
+def _strip_hmm_layer_prefix(layer: str) -> str:
+    """Strip methbase prefixes and length suffixes from an HMM layer name.
+
+    Args:
+        layer: Full layer name (e.g., "GpC_small_accessible_patch_lengths").
+
+    Returns:
+        The base layer name without methbase prefixes or length suffixes.
+    """
+    base = layer
+    for prefix in ("Combined_", "GpC_", "CpG_", "C_", "A_"):
+        if base.startswith(prefix):
+            base = base[len(prefix) :]
+            break
+    if base.endswith("_lengths"):
+        base = base[: -len("_lengths")]
+    if base.endswith("_merged"):
+        base = base[: -len("_merged")]
+    return base
+
+
+def _resolve_feature_colormap(layer: str, cfg, default_cmap: str) -> Any:
+    """Resolve a colormap for a given HMM layer.
+
+    Args:
+        layer: Full layer name.
+        cfg: Experiment config.
+        default_cmap: Fallback colormap name.
+
+    Returns:
+        A matplotlib colormap or colormap name.
+    """
+    feature_maps = getattr(cfg, "hmm_feature_colormaps", {}) or {}
+    if not isinstance(feature_maps, dict):
+        feature_maps = {}
+
+    base = _strip_hmm_layer_prefix(layer)
+    value = feature_maps.get(layer, feature_maps.get(base))
+    if value is None:
+        return default_cmap
+
+    if isinstance(value, (list, tuple)):
+        return mpl_colors.ListedColormap(list(value))
+
+    if isinstance(value, str):
+        try:
+            mpl.colormaps.get_cmap(value)
+            return value
+        except Exception:
+            return mpl_colors.LinearSegmentedColormap.from_list(
+                f"hmm_{base}_cmap", ["#ffffff", value]
+            )
+
+    return default_cmap
+
+
+def _resolve_feature_color(layer: str, cfg, fallback_cmap: str, idx: int, total: int) -> Any:
+    """Resolve a line color for a given HMM layer."""
+    feature_maps = getattr(cfg, "hmm_feature_colormaps", {}) or {}
+    if not isinstance(feature_maps, dict):
+        feature_maps = {}
+
+    base = _strip_hmm_layer_prefix(layer)
+    value = feature_maps.get(layer, feature_maps.get(base))
+    if isinstance(value, str):
+        try:
+            mpl.colormaps.get_cmap(value)
+        except Exception:
+            return value
+        return mpl.colormaps.get_cmap(value)(0.75)
+    if isinstance(value, (list, tuple)) and value:
+        return value[-1]
+
+    cmap_obj = mpl.colormaps.get_cmap(fallback_cmap)
+    if total <= 1:
+        return cmap_obj(0.5)
+    return cmap_obj(idx / (total - 1))
+
+
+def _resolve_length_feature_ranges(
+    layer: str, cfg, default_cmap: str
+) -> List[Tuple[int, int, Any]]:
+    """Resolve length-based feature ranges to colors for size contour overlays."""
+    base = _strip_hmm_layer_prefix(layer)
+    feature_sets = getattr(cfg, "hmm_feature_sets", {}) or {}
+    if not isinstance(feature_sets, dict):
+        return []
+
+    feature_key = None
+    if "accessible" in base:
+        feature_key = "accessible"
+    elif "footprint" in base:
+        feature_key = "footprint"
+
+    if feature_key is None:
+        return []
+
+    features = feature_sets.get(feature_key, {}).get("features", {})
+    if not isinstance(features, dict):
+        return []
+
+    ranges: List[Tuple[int, int, Any]] = []
+    for feature_name, bounds in features.items():
+        if not isinstance(bounds, (list, tuple)) or len(bounds) != 2:
+            continue
+        min_len, max_len = bounds
+        if max_len is None or (isinstance(max_len, (float, int)) and np.isinf(max_len)):
+            max_len = int(1e9)
+        try:
+            min_len_int = int(min_len)
+            max_len_int = int(max_len)
+        except (TypeError, ValueError):
+            continue
+        color = _resolve_feature_color(feature_name, cfg, default_cmap, 0, 1)
+        ranges.append((min_len_int, max_len_int, color))
+    return ranges
+
+
 def _get_training_matrix(
     subset, cols_mask: np.ndarray, smf_modality: Optional[str], cfg
 ) -> Tuple[np.ndarray, Optional[str]]:
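The new colormap helpers key `cfg.hmm_feature_colormaps` by either the full layer name or its stripped base, so one config entry can cover every methbase variant of a feature. A standalone sketch of that stripping and lookup order; the example layer name and config dict are hypothetical:

```python
# Standalone sketch mirroring the _strip_hmm_layer_prefix / colormap lookup order above.
def strip_hmm_layer_prefix(layer: str) -> str:
    base = layer
    for prefix in ("Combined_", "GpC_", "CpG_", "C_", "A_"):
        if base.startswith(prefix):
            base = base[len(prefix):]
            break
    for suffix in ("_lengths", "_merged"):
        if base.endswith(suffix):
            base = base[: -len(suffix)]
    return base

# Hypothetical cfg.hmm_feature_colormaps contents.
feature_colormaps = {"accessible_patch": "Greens", "footprint": ["#ffffff", "#4c72b0"]}

layer = "GpC_accessible_patch_lengths"
base = strip_hmm_layer_prefix(layer)                                 # -> "accessible_patch"
cmap = feature_colormaps.get(layer, feature_colormaps.get(base, "viridis"))
print(base, cmap)                                                    # accessible_patch Greens
```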
@@ -445,31 +567,25 @@ def hmm_adata(config_path: str):
     - Call hmm_adata_core(cfg, adata, paths)
     """
     from ..readwrite import safe_read_h5ad
-    from .helpers import get_adata_paths
-    from .load_adata import load_adata
-    from .preprocess_adata import preprocess_adata
-    from .spatial_adata import spatial_adata
+    from .helpers import get_adata_paths, load_experiment_config
 
     # 1) load cfg / stage paths
-
-    paths = get_adata_paths(cfg)
+    cfg = load_experiment_config(config_path)
 
-
-    preprocess_adata(config_path)
-    spatial_ad, spatial_path = spatial_adata(config_path)
+    paths = get_adata_paths(cfg)
 
-    #
+    # 2) choose starting AnnData
     # Prefer:
     # - existing HMM h5ad if not forcing redo
     # - in-memory spatial_ad from wrapper call
     # - saved spatial / pp_dedup / pp / raw on disk
     if paths.hmm.exists() and not (cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply):
-
-        return
+        logger.debug(f"Skipping hmm. HMM AnnData found: {paths.hmm}")
+        return None
 
-    if
-        adata =
-        source_path =
+    if paths.hmm.exists():
+        adata, _ = safe_read_h5ad(paths.hmm)
+        source_path = paths.hmm
     elif paths.spatial.exists():
         adata, _ = safe_read_h5ad(paths.spatial)
         source_path = paths.spatial
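The wrapper now resolves its starting AnnData by walking the stage outputs from most to least processed (hmm, then spatial, pp_dedup, pp, raw), instead of re-running preprocessing and spatial stages itself. A simplified sketch of that priority walk; error handling and the in-memory `spatial_ad` shortcut mentioned in the diff comments are omitted, and `pick_starting_adata` is an illustrative name, not a package function:

```python
# Simplified sketch of the stage-output fallback order used by hmm_adata().
def pick_starting_adata(paths, safe_read_h5ad):
    # Most-processed output first, raw load output last.
    candidates = [paths.hmm, paths.spatial, paths.pp_dedup, paths.pp, paths.raw]
    for candidate in candidates:
        if candidate.exists():
            adata, _ = safe_read_h5ad(candidate)
            return adata, candidate
    raise FileNotFoundError("No stage AnnData found; run load/preprocess stages first.")
```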
@@ -516,11 +632,14 @@ def hmm_adata_core(
     Does NOT decide which h5ad to start from – that is the wrapper's job.
     """
 
+    from datetime import datetime
+
     import numpy as np
 
     from ..hmm import call_hmm_peaks
     from ..metadata import record_smftools_metadata
     from ..plotting import (
+        combined_hmm_length_clustermap,
         combined_hmm_raw_clustermap,
         plot_hmm_layers_rolling_by_sample_ref,
         plot_hmm_size_contours,
@@ -528,18 +647,33 @@
     from ..readwrite import make_dirs
     from .helpers import write_gz_h5ad
 
+    date_str = datetime.today().strftime("%y%m%d")
+    now = datetime.now()
+    time_str = now.strftime("%H%M%S")
+
+    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
+
     smf_modality = cfg.smf_modality
     deaminase = smf_modality == "deaminase"
 
     output_directory = Path(cfg.output_directory)
-
+    hmm_directory = output_directory / HMM_DIR
+    logging_directory = hmm_directory / LOGGING_DIR
+
+    make_dirs([output_directory, hmm_directory])
+
+    if cfg.emit_log_file:
+        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
+        make_dirs([logging_directory])
+    else:
+        log_file = None
 
-
+    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
 
     # ---------------------------- HMM annotate stage ----------------------------
     if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
-        hmm_models_dir =
-        make_dirs([
+        hmm_models_dir = hmm_directory / "10_hmm_models"
+        make_dirs([hmm_directory, hmm_models_dir])
 
         # Standard bookkeeping
         uns_key = "hmm_appended_layers"
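Per-stage logging now matches the other CLI stages: a timestamped log file is created only when `cfg.emit_log_file` is set, and `setup_logging` is reconfigured only in that case. A small sketch of the file-naming convention shown above; the directory and flag values are illustrative:

```python
# Illustrative reproduction of the log-file naming used by hmm_adata_core().
from datetime import datetime
from pathlib import Path

emit_log_file = True                      # stands in for cfg.emit_log_file
logging_directory = Path("out/hmm/logs")  # stands in for hmm_directory / LOGGING_DIR

now = datetime.now()
stamp = f"{now.strftime('%y%m%d')}_{now.strftime('%H%M%S')}"
log_file = logging_directory / f"{stamp}_log.log" if emit_log_file else None
print(log_file)  # e.g. out/hmm/logs/240531_142305_log.log
```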
@@ -743,6 +877,8 @@
             uns_key=uns_key,
             uns_flag="hmm_annotated_combined",
             force_redo=force_apply,
+            mask_to_read_span=True,
+            mask_use_original_var_names=True,
         )
 
         for core_layer, dist in (
@@ -855,11 +991,11 @@
         logger.info(f"HMM appended layers: {hmm_layers}")
 
     # ---------------------------- HMM peak calling stage ----------------------------
-    hmm_dir =
+    hmm_dir = hmm_directory / "11_hmm_peak_calling"
     if hmm_dir.is_dir():
         pass
     else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])
 
     call_hmm_peaks(
         adata,
@@ -888,8 +1024,8 @@
 
     ############################################### HMM based feature plotting ###############################################
 
-    hmm_dir =
-    make_dirs([
+    hmm_dir = hmm_directory / "12_hmm_clustermaps"
+    make_dirs([hmm_directory, hmm_dir])
 
     layers: list[str] = []
 
@@ -914,6 +1050,7 @@
             pass
         else:
            make_dirs([hmm_cluster_save_dir])
+        hmm_cmap = _resolve_feature_colormap(layer, cfg, cfg.clustermap_cmap_hmm)
 
        combined_hmm_raw_clustermap(
            adata,
@@ -924,7 +1061,7 @@
            layer_cpg=cfg.layer_for_clustermap_plotting,
            layer_c=cfg.layer_for_clustermap_plotting,
            layer_a=cfg.layer_for_clustermap_plotting,
-           cmap_hmm=
+           cmap_hmm=hmm_cmap,
            cmap_gpc=cfg.clustermap_cmap_gpc,
            cmap_cpg=cfg.clustermap_cmap_cpg,
            cmap_c=cfg.clustermap_cmap_c,
@@ -935,7 +1072,7 @@
                0
            ],
            min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
-           demux_types=
+           demux_types=cfg.clustermap_demux_types_to_plot,
            save_path=hmm_cluster_save_dir,
            normalize_hmm=False,
            sort_by=cfg.hmm_clustermap_sortby,  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
@@ -945,12 +1082,68 @@
            index_col_suffix=cfg.reindexed_var_suffix,
        )
 
-
+    hmm_length_dir = hmm_directory / "12b_hmm_length_clustermaps"
+    make_dirs([hmm_directory, hmm_length_dir])
+
+    length_layers: list[str] = []
+    length_layer_roots = list(
+        getattr(cfg, "hmm_clustermap_length_layers", cfg.hmm_clustermap_feature_layers)
+    )
+
+    for base in cfg.hmm_methbases:
+        length_layers.extend([f"{base}_{layer}_lengths" for layer in length_layer_roots])
+
+    if getattr(cfg, "hmm_run_multichannel", True) and len(cfg.hmm_methbases) >= 2:
+        length_layers.extend([f"Combined_{layer}_lengths" for layer in length_layer_roots])
+
+    if cfg.cpg:
+        length_layers.extend(["CpG_cpg_patch_lengths"])
+
+    for layer in length_layers:
+        hmm_cluster_save_dir = hmm_length_dir / layer
+        if hmm_cluster_save_dir.is_dir():
+            pass
+        else:
+            make_dirs([hmm_cluster_save_dir])
+        length_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
+        length_feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")
+
+        combined_hmm_length_clustermap(
+            adata,
+            sample_col=cfg.sample_name_col_for_plotting,
+            reference_col=cfg.reference_column,
+            length_layer=layer,
+            layer_gpc=cfg.layer_for_clustermap_plotting,
+            layer_cpg=cfg.layer_for_clustermap_plotting,
+            layer_c=cfg.layer_for_clustermap_plotting,
+            layer_a=cfg.layer_for_clustermap_plotting,
+            cmap_lengths=length_cmap,
+            cmap_gpc=cfg.clustermap_cmap_gpc,
+            cmap_cpg=cfg.clustermap_cmap_cpg,
+            cmap_c=cfg.clustermap_cmap_c,
+            cmap_a=cfg.clustermap_cmap_a,
+            min_quality=cfg.read_quality_filter_thresholds[0],
+            min_length=cfg.read_len_filter_thresholds[0],
+            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
+                0
+            ],
+            min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
+            demux_types=cfg.clustermap_demux_types_to_plot,
+            save_path=hmm_cluster_save_dir,
+            sort_by=cfg.hmm_clustermap_sortby,
+            bins=None,
+            deaminase=deaminase,
+            min_signal=0,
+            index_col_suffix=cfg.reindexed_var_suffix,
+            length_feature_ranges=length_feature_ranges,
+        )
+
+    hmm_dir = hmm_directory / "13_hmm_bulk_traces"
 
     if hmm_dir.is_dir():
         logger.debug(f"{hmm_dir} already exists.")
     else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])
         from ..plotting import plot_hmm_layers_rolling_by_sample_ref
 
     bulk_hmm_layers = [
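The new length-clustermap block derives its layer list from the configured methbases plus an optional combined channel and a CpG patch layer. A quick sketch of the names it produces; the config values shown here are made up for illustration:

```python
# Example of the length-layer names generated before calling combined_hmm_length_clustermap.
hmm_methbases = ["GpC", "CpG"]                         # stands in for cfg.hmm_methbases
length_layer_roots = ["accessible_patch", "footprint"] # stands in for the configured layer roots
run_multichannel = True
use_cpg = True

length_layers = [f"{base}_{root}_lengths" for base in hmm_methbases for root in length_layer_roots]
if run_multichannel and len(hmm_methbases) >= 2:
    length_layers += [f"Combined_{root}_lengths" for root in length_layer_roots]
if use_cpg:
    length_layers.append("CpG_cpg_patch_lengths")

print(length_layers)
# ['GpC_accessible_patch_lengths', 'GpC_footprint_lengths',
#  'CpG_accessible_patch_lengths', 'CpG_footprint_lengths',
#  'Combined_accessible_patch_lengths', 'Combined_footprint_lengths',
#  'CpG_cpg_patch_lengths']
```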
@@ -958,6 +1151,10 @@
        for layer in hmm_layers
        if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
    ]
+    layer_colors = {
+        layer: _resolve_feature_color(layer, cfg, "tab20", idx, len(bulk_hmm_layers))
+        for idx, layer in enumerate(bulk_hmm_layers)
+    }
    saved = plot_hmm_layers_rolling_by_sample_ref(
        adata,
        layers=bulk_hmm_layers,
@@ -969,14 +1166,15 @@
        output_dir=hmm_dir,
        save=True,
        show_raw=False,
+        layer_colors=layer_colors,
    )
 
-    hmm_dir =
+    hmm_dir = hmm_directory / "14_hmm_fragment_distributions"
 
    if hmm_dir.is_dir():
        logger.debug(f"{hmm_dir} already exists.")
    else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])
        from ..plotting import plot_hmm_size_contours
 
    if smf_modality == "deaminase":
@@ -1001,6 +1199,8 @@
    for layer, max in fragments:
        save_path = hmm_dir / layer
        make_dirs([save_path])
+        layer_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
+        feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")
 
        figs = plot_hmm_size_contours(
            adata,
@@ -1016,8 +1216,9 @@
            dpi=200,
            smoothing_sigma=(10, 10),
            normalize_after_smoothing=True,
-            cmap=
+            cmap=layer_cmap,
            log_scale_z=True,
+            feature_ranges=tuple(feature_ranges),
        )
    ########################################################################################################################
 
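The size-contour plots now receive `feature_ranges`: the (min_length, max_length, color) tuples that `_resolve_length_feature_ranges` derives from `cfg.hmm_feature_sets`. A sketch of the shape of that mapping; the feature names, bounds, and color used here are invented, and the color resolution is simplified to a constant:

```python
# Hypothetical cfg.hmm_feature_sets entry and the ranges derived from it.
import math

hmm_feature_sets = {
    "accessible": {
        "features": {
            "small_accessible_patch": [1, 80],
            "large_accessible_patch": [80, math.inf],  # open-ended upper bound
        }
    }
}

ranges = []
for name, (min_len, max_len) in hmm_feature_sets["accessible"]["features"].items():
    if max_len is None or math.isinf(max_len):
        max_len = int(1e9)                    # mirrors the diff's open-ended handling
    ranges.append((int(min_len), int(max_len), "tab:green"))  # color resolution simplified

print(ranges)  # [(1, 80, 'tab:green'), (80, 1000000000, 'tab:green')]
```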