smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import anndata as ad
|
|
8
|
+
|
|
9
|
+
from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
11
|
+
|
|
12
|
+
# Module-level logger, obtained from smftools' get_logger using this module's __name__.
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def latent_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    Entry point behind `smftools latent <config_path>`.

    Loads the experiment config, resolves the per-stage AnnData paths, picks
    the most downstream AnnData that exists on disk (latent, then hmm,
    spatial, pp_dedup, pp) and hands it to `latent_adata_core(...)`.

    Parameters
    ----------
    config_path : str
        Path to the experiment configuration, passed to `load_experiment_config`.

    Returns
    -------
    latent_adata : AnnData | None
        AnnData returned by `latent_adata_core`. None when the latent AnnData
        already exists and `force_redo_latent_analyses` is not set, or when no
        input AnnData was found on disk.
    latent_adata_path : Path | None
        Path of the latent AnnData. None only when no input AnnData was found.
    """
    from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)

    # Stage paths listed from least to most processed.
    stage_paths = [
        getattr(paths, stage) for stage in ("pp", "pp_dedup", "spatial", "hmm", "latent")
    ]
    latent_path = stage_paths[-1]

    # An existing latent AnnData means the stage is done, unless a redo is forced.
    redo_forced = getattr(cfg, "force_redo_latent_analyses", False)
    if not redo_forced and latent_path.exists():
        logger.info(f"Latent AnnData found: {latent_path}\nSkipping smftools latent")
        return None, latent_path

    # Start from the most processed AnnData available on disk.
    source_path = next(
        (candidate for candidate in reversed(stage_paths) if candidate.exists()),
        None,
    )
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for latent analyses (need at least preprocessed)."
        )
        return None, None

    start_adata, _ = safe_read_h5ad(source_path)

    result_adata, result_path = latent_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )
    return result_adata, result_path
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def latent_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core latent-representation pipeline.

    Steps, in order:
    - Configure logging (optionally to a timestamped file under the latent
      output directory).
    - Load the sample sheet when `cfg.sample_sheet_path` is set.
    - Invert the AnnData when `cfg.invert_adata` is set.
    - Call `reindex_references_adata` with the configured offsets.
    - UMAP + Leiden, NMF, and CP decomposition of the sequence integer
      encoding layer, each with its plots. A step is skipped when its output
      directory already exists and no redo is forced.
    - Write the AnnData to `paths.latent` when that file does not exist or
      `cfg.force_redo_latent_analyses` is set.

    Parameters
    ----------
    adata : AnnData
        Input AnnData (typically preprocessed and deduplicated, or later stage).
    cfg
        The ExperimentConfig.
    paths : AdataPaths
        Stage paths object; only `paths.latent` is read here.
    source_adata_path : Path | None
        Path `adata` was loaded from; recorded as the input path in metadata.
    config_path : str | None
        Config file path; recorded in metadata.

    Returns
    -------
    adata : AnnData
        The analyzed AnnData as returned by the last processing step that ran.
    adata_path : Path
        `paths.latent`, i.e. where the AnnData is (or already was) stored.

    Notes
    -----
    The per-step redo check honours `force_redo_latent_analyses`. The older
    `force_redo_spatial_analyses` flag is still accepted so existing configs
    keep forcing these steps.
    """
    from datetime import datetime

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_cp_sequence_components,
        plot_embedding,
        plot_nmf_components,
        plot_pca,
        plot_umap,
    )
    from ..preprocessing import (
        invert_adata,
        load_sample_sheet,
        reindex_references_adata,
    )
    from ..readwrite import make_dirs
    from ..tools import (
        calculate_leiden,
        calculate_nmf,
        calculate_sequence_cp_decomposition,
        calculate_umap,
    )
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # Single clock read so the date and time parts of the log name always agree.
    now = datetime.now()
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    latent_adata_path = paths.latent

    output_directory = Path(cfg.output_directory)
    latent_directory = output_directory / LATENT_DIR
    logging_directory = latent_directory / LOGGING_DIR

    make_dirs([output_directory, latent_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{now.strftime('%y%m%d')}_{now.strftime('%H%M%S')}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    smf_modality = cfg.smf_modality

    # The latent flag gates saving; either flag forces the individual steps.
    force_latent = getattr(cfg, "force_redo_latent_analyses", False)
    force_steps = force_latent or getattr(cfg, "force_redo_spatial_analyses", False)

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # -----------------------------
    # Optional inversion along positions axis
    # -----------------------------
    if getattr(cfg, "invert_adata", False):
        adata = invert_adata(adata)

    # -----------------------------
    # Reindexing by reference
    # -----------------------------
    # Called unconditionally; presumably a no-op without configured offsets
    # (TODO confirm against reindex_references_adata).
    reindex_references_adata(
        adata,
        reference_col=cfg.reference_column,
        offsets=cfg.reindexing_offsets,
        new_col=cfg.reindexed_var_suffix,
    )

    references = adata.obs[cfg.reference_column].cat.categories

    # ============================================================
    # Latent analyses (UMAP/Leiden, NMF, sequence CP)
    # ============================================================
    latent_dir_dedup = latent_directory / "deduplicated"
    umap_dir = latent_dir_dedup / "07_umaps"
    nmf_dir = latent_dir_dedup / "07b_nmf"
    nmf_sequence_dir = latent_dir_dedup / "07c_nmf_sequence"

    # "direct" and "conversion" use the configured target bases; any other
    # modality uses the C sites only.
    if smf_modality in ("direct", "conversion"):
        site_bases = cfg.mod_target_bases
    else:
        site_bases = ["C"]
    var_filters = [f"{ref}_{base}_site" for ref in references for base in site_bases]

    def _plot_colors():
        """Return a fresh list of obs keys used to color the embedding plots."""
        return [
            "leiden",
            cfg.sample_name_col_for_plotting,
            "Reference_strand",
            *cfg.umap_layers_to_plot,
        ]

    # UMAP / Leiden
    if umap_dir.is_dir() and not force_steps:
        logger.debug("%s already exists. Skipping UMAP plotting.", umap_dir)
    else:
        make_dirs([umap_dir])

        adata = calculate_umap(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_pcs=10,
            knn_neighbors=15,
        )

        calculate_leiden(adata, resolution=0.1)

        umap_layers = _plot_colors()
        plot_umap(adata, color=umap_layers, output_dir=umap_dir)
        plot_pca(adata, color=umap_layers, output_dir=umap_dir)

    # NMF
    if nmf_dir.is_dir() and not force_steps:
        logger.debug("%s already exists. Skipping NMF plotting.", nmf_dir)
    else:
        make_dirs([nmf_dir])
        adata = calculate_nmf(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_components=5,
        )
        plot_embedding(adata, basis="nmf", color=_plot_colors(), output_dir=nmf_dir)
        plot_nmf_components(adata, output_dir=nmf_dir)

    # CP decomposition using sequence integer encoding (no var filters)
    if nmf_sequence_dir.is_dir() and not force_steps:
        logger.debug("%s already exists. Skipping sequence CP plotting.", nmf_sequence_dir)
    elif SEQUENCE_INTEGER_ENCODING not in adata.layers:
        logger.warning(
            "Layer %s not found; skipping sequence integer encoding CP.",
            SEQUENCE_INTEGER_ENCODING,
        )
    else:
        make_dirs([nmf_sequence_dir])
        adata = calculate_sequence_cp_decomposition(
            adata,
            layer=SEQUENCE_INTEGER_ENCODING,
            rank=5,
            embedding_key="X_cp_sequence",
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )
        plot_embedding(
            adata,
            basis="cp_sequence",
            color=_plot_colors(),
            output_dir=nmf_sequence_dir,
        )
        plot_cp_sequence_components(
            adata,
            output_dir=nmf_sequence_dir,
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )

    # ============================================================
    # Save latent AnnData
    # ============================================================
    if (not latent_adata_path.exists()) or force_latent:
        logger.info("Saving latent analyzed AnnData (post preprocessing and duplicate removal).")
        record_smftools_metadata(
            adata,
            step_name="latent",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=latent_adata_path,
        )
        write_gz_h5ad(adata, latent_adata_path)

    return adata, latent_adata_path
|