smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import anndata as ad
|
|
8
|
+
|
|
9
|
+
from smftools.constants import LOGGING_DIR, VARIANT_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def variant_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for variant analyses.

    Called by: `smftools variant <config_path>`

    Responsibilities:
    - Load the experiment config and resolve the per-stage AnnData paths.
    - Skip (returning the existing variant path) when the variant AnnData is
      already on disk and `force_redo_variant_analyses` is not set.
    - Otherwise pick the first existing stage AnnData as the starting point
      and call `variant_adata_core(...)`.

    Parameters:
        config_path: Path to the experiment config file.

    Returns:
        `(adata, variant_path)` when the core ran, `(None, variant_path)` when
        the stage was skipped, or `(None, None)` when no input AnnData exists.
    """
    from ..readwrite import safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    # 1) Load config and resolve the stage paths.
    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)
    variant_path = paths.variant

    # 2) Stage-skipping logic. Report the *variant* path, since that is the
    #    artifact whose existence triggered the skip.
    if not getattr(cfg, "force_redo_variant_analyses", False) and variant_path.exists():
        logger.info(f"Variant AnnData found: {variant_path}\nSkipping smftools variant")
        return None, variant_path

    # 3) Decide which AnnData to use as the *starting point* for analyses.
    #    Order matters: the first existing path wins.
    candidate_paths = (
        paths.hmm,
        paths.latent,
        paths.spatial,
        paths.chimeric,
        variant_path,
        paths.pp_dedup,
        paths.pp,
    )
    source_path = next((p for p in candidate_paths if p.exists()), None)
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for variant analyses (need at least preprocessed)."
        )
        return None, None

    # safe_read_h5ad returns a pair; only the AnnData (first item) is used here.
    start_adata, _ = safe_read_h5ad(source_path)

    # 4) Run the core
    adata_variant, variant_path = variant_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )

    return adata_variant, variant_path
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _find_converted_column(unconverted_col: str, var_columns) -> str | None:
    """Find the converted FASTA column corresponding to an unconverted one.

    Unconverted columns follow the pattern ``{chromosome}_{strand}_strand_FASTA_base``.
    Converted columns follow ``{chromosome}_{conversion}_{strand}_{strand}_strand_FASTA_base``
    (e.g. ``6B6_5mC_top_top_strand_FASTA_base`` for unconverted ``6B6_top_strand_FASTA_base``).

    Parameters:
        unconverted_col: Name of the unconverted FASTA base column.
        var_columns: Iterable of column names to search.

    Returns:
        The matching converted column name, the first match when several
        columns fit (a warning is logged), or ``None`` when the input does not
        follow the expected pattern or nothing matches.
    """
    suffix = "_strand_FASTA_base"
    if not unconverted_col.endswith(suffix):
        return None
    stem = unconverted_col[: -len(suffix)]  # e.g. "6B6_top"

    # Parse strand from end of stem: "6B6_top" -> strand="top", chrom="6B6"
    for strand in ("top", "bottom"):
        strand_tag = f"_{strand}"
        if not stem.endswith(strand_tag):
            continue
        chrom = stem[: -len(strand_tag)]
        # The strand appears twice in a converted column: once in the record
        # name, once in the suffix.
        prefix = f"{chrom}_"
        end = f"_{strand}_{strand}{suffix}"
        candidates = [
            c
            for c in var_columns
            if c.startswith(prefix) and c.endswith(end) and c != unconverted_col
        ]
        if not candidates:
            return None
        if len(candidates) > 1:
            # Ambiguous match: the first candidate is used, so make that visible.
            logger.warning(
                "Multiple converted column candidates for '%s': %s",
                unconverted_col,
                candidates,
            )
        return candidates[0]
    return None


def variant_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core variant analysis pipeline.

    Assumes:
    - `cfg` is the ExperimentConfig.

    Does:
    - Set up output/logging directories and logging.
    - Optionally load sample sheet metadata into `adata`.
    - Annotate reference mismatch positions and per-position mismatch frequency.
    - Append per-read variant call and variant segment layers when a reference
      pair is configured.
    - Write mismatch frequency, integer-encoding and variant segment plots.
    - Save AnnData

    Parameters:
        adata: AnnData to annotate; it is passed to the annotation helpers and returned.
        cfg: Experiment configuration object.
        paths: Stage paths; `paths.variant` is the output location.
        source_adata_path: Path `adata` was loaded from, recorded as metadata.
        config_path: Path of the config file, recorded as metadata.

    Returns:
        `(adata, paths.variant)`.
    """
    from datetime import datetime
    from pathlib import Path

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_mismatch_base_frequency_by_position,
        plot_sequence_integer_encoding_clustermaps,
        plot_variant_segment_clustermaps,
    )
    from ..preprocessing import (
        append_mismatch_frequency_sites,
        append_sequence_mismatch_annotations,
        append_variant_call_layer,
        append_variant_segment_layer,
        load_sample_sheet,
    )
    from ..readwrite import make_dirs
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # One clock read so the date and time parts of the log name cannot disagree.
    now = datetime.now()
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M%S")
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    output_directory = Path(cfg.output_directory)
    variant_directory = output_directory / VARIANT_DIR
    logging_directory = variant_directory / LOGGING_DIR

    make_dirs([output_directory, variant_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # ============================================================
    # 1) Reference variant position annotation
    # ============================================================
    # The config value may be absent, None, or malformed; only a two-item
    # value enables the pairwise reference steps.
    reference_pair = getattr(cfg, "references_to_align_for_variant_annotation", None)
    if not reference_pair:
        reference_pair = (None, None)
    try:
        seq1_col, seq2_col = reference_pair
    except (TypeError, ValueError):
        logger.warning(
            "references_to_align_for_variant_annotation must contain exactly two entries; "
            "got %r. Skipping reference-pair variant steps.",
            reference_pair,
        )
        seq1_col = seq2_col = None
    has_reference_pair = bool(seq1_col and seq2_col)

    if has_reference_pair:
        append_sequence_mismatch_annotations(adata, seq1_col, seq2_col)

    reference_col = cfg.reference_column
    mismatch_layer = cfg.mismatch_frequency_layer
    read_span_layer = cfg.mismatch_frequency_read_span_layer

    ############################################### Append mismatch frequency per position ###############################################
    append_mismatch_frequency_sites(
        adata,
        ref_column=reference_col,
        mismatch_layer=mismatch_layer,
        read_span_layer=read_span_layer,
        mismatch_frequency_range=cfg.mismatch_frequency_range,
        bypass=cfg.bypass_append_mismatch_frequency_sites,
        force_redo=cfg.force_redo_append_mismatch_frequency_sites,
    )

    # ============================================================
    # 2) Per-read variant call layer at reference mismatch sites
    # ============================================================
    if has_reference_pair:
        # For conversion SMF, derive converted column names so variant calling
        # compares read bases against the converted reference (which reads are mapped to).
        seq1_conv = _find_converted_column(seq1_col, adata.var.columns)
        seq2_conv = _find_converted_column(seq2_col, adata.var.columns)
        if seq1_conv and seq2_conv:
            logger.info("Using converted columns: '%s', '%s'", seq1_conv, seq2_conv)

        append_variant_call_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            seq1_converted_column=seq1_conv,
            seq2_converted_column=seq2_conv,
            read_span_layer=read_span_layer,
            reference_col=reference_col,
        )

        append_variant_segment_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            read_span_layer=read_span_layer,
            reference_col=reference_col,
        )

    # ============================================================
    # 3) Mismatch and integer-encoding plots
    # ============================================================
    # NOTE(review): the skip checks for plot directories 01 and 02 consult
    # cfg.force_redo_preprocessing rather than the variant force flag, and
    # directories 03-05 have no force override - confirm this is intended.
    deduplicated_directory = variant_directory / "deduplicated"

    ############################################### Plot mismatch base frequencies ###############################################
    if mismatch_layer not in adata.layers:
        logger.debug(
            "Mismatch layer '%s' not found; skipping mismatch base frequency plots.",
            mismatch_layer,
        )
    elif not adata.uns.get("mismatch_integer_encoding_map"):
        logger.debug("Mismatch encoding map not found; skipping mismatch base frequency plots.")
    else:
        mismatch_base_freq_dir = deduplicated_directory / "01_mismatch_base_frequency_plots"
        if mismatch_base_freq_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{mismatch_base_freq_dir} already exists. Skipping mismatch base frequency plots."
            )
        else:
            make_dirs([mismatch_base_freq_dir])
            plot_mismatch_base_frequency_by_position(
                adata,
                sample_col=cfg.sample_name_col_for_plotting,
                reference_col=reference_col,
                mismatch_layer=mismatch_layer,
                read_span_layer=read_span_layer,
                exclude_mod_sites=True,  # cfg.mismatch_base_frequency_exclude_mod_sites,
                mod_site_bases=cfg.mod_target_bases,
                save_path=mismatch_base_freq_dir,
                plot_zscores=True,
            )

    ############################################### Plot integer sequence encoding clustermaps ###############################################
    def _plot_encoding_clustermaps(save_path, **extra):
        """Plot integer-encoding clustermaps into `save_path` with the shared settings."""
        make_dirs([save_path])
        plot_sequence_integer_encoding_clustermaps(
            adata,
            sample_col=cfg.sample_name_col_for_plotting,
            reference_col=reference_col,
            demux_types=cfg.clustermap_demux_types_to_plot,
            min_quality=None,
            min_length=None,
            min_mapped_length_to_reference_length_ratio=None,
            sort_by="none",
            max_unknown_fraction=0.5,
            save_path=save_path,
            show_position_axis=True,
            **extra,
        )

    if "sequence_integer_encoding" not in adata.layers:
        logger.debug(
            "sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
        )
    else:
        seq_clustermap_dir = deduplicated_directory / "02_sequence_integer_encoding_clustermaps"
        if seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
            )
        else:
            _plot_encoding_clustermaps(seq_clustermap_dir)

        # NOTE(review): this block is assumed to sit under the
        # sequence-encoding branch - confirm against upstream.
        if "mismatch_integer_encoding" in adata.layers:
            mismatch_clustermap_dir = (
                deduplicated_directory / "03_mismatch_integer_encoding_clustermaps_no_mod_sites"
            )
            if mismatch_clustermap_dir.is_dir():
                logger.debug(
                    f"{mismatch_clustermap_dir} already exists. "
                    "Skipping mismatch clustermaps without mod sites."
                )
            else:
                _plot_encoding_clustermaps(
                    mismatch_clustermap_dir,
                    exclude_mod_sites=True,
                    mod_site_bases=cfg.mod_target_bases,
                )

    # ============================================================
    # 4) Variant segment clustermaps
    # ============================================================
    if has_reference_pair:
        segment_layer_name = f"{seq1_col}__{seq2_col}_variant_segments"
        if segment_layer_name in adata.layers:

            def _plot_segment_clustermaps(save_path, **extra):
                """Plot variant segment clustermaps into `save_path` with the shared settings."""
                make_dirs([save_path])
                plot_variant_segment_clustermaps(
                    adata,
                    seq1_column=seq1_col,
                    seq2_column=seq2_col,
                    sample_col=cfg.sample_name_col_for_plotting,
                    reference_col=reference_col,
                    variant_segment_layer=segment_layer_name,
                    read_span_layer=read_span_layer,
                    save_path=save_path,
                    ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
                    ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
                    marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
                    show_position_axis=True,
                    **extra,
                )

            segment_dir = deduplicated_directory / "04_variant_segment_clustermaps"
            if segment_dir.exists():
                logger.info(
                    "Variant segment clustermaps already exist at %s; skipping.",
                    segment_dir,
                )
            else:
                _plot_segment_clustermaps(segment_dir)

            segment_type_dir = (
                deduplicated_directory / "05_variant_segment_clustermaps_with_mismatch_type"
            )
            if segment_type_dir.exists():
                logger.info(
                    "Variant segment mismatch-type clustermaps already exist at %s; skipping.",
                    segment_type_dir,
                )
            else:
                _plot_segment_clustermaps(
                    segment_type_dir,
                    mismatch_type_obs_col="chimeric_variant_sites_type",
                )

    # ============================================================
    # 5) Save AnnData
    # ============================================================
    # A forced redo must overwrite the previous output; otherwise the rerun
    # results would exist only in memory.
    force_redo = getattr(cfg, "force_redo_variant_analyses", False)
    if force_redo or not paths.variant.exists():
        logger.info("Saving variant AnnData")
        record_smftools_metadata(
            adata,
            step_name="variant",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=paths.variant,
        )
        write_gz_h5ad(adata, paths.variant)

    return adata, paths.variant
|
smftools/cli_entry.py
CHANGED
|
@@ -7,10 +7,14 @@ from typing import Sequence
|
|
|
7
7
|
import click
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
|
+
from .cli.chimeric_adata import chimeric_adata
|
|
10
11
|
from .cli.hmm_adata import hmm_adata
|
|
12
|
+
from .cli.latent_adata import latent_adata
|
|
11
13
|
from .cli.load_adata import load_adata
|
|
12
14
|
from .cli.preprocess_adata import preprocess_adata
|
|
15
|
+
from .cli.recipes import full_flow
|
|
13
16
|
from .cli.spatial_adata import spatial_adata
|
|
17
|
+
from .cli.variant_adata import variant_adata
|
|
14
18
|
from .informatics.pod5_functions import subsample_pod5
|
|
15
19
|
from .logging_utils import get_logger, setup_logging
|
|
16
20
|
from .readwrite import concatenate_h5ads
|
|
@@ -63,7 +67,7 @@ def cli(log_file: Path | None, log_level: str):
|
|
|
63
67
|
@cli.command()
|
|
64
68
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
65
69
|
def load(config_path):
|
|
66
|
-
"""Load
|
|
70
|
+
"""Load raw data into AnnData."""
|
|
67
71
|
load_adata(config_path)
|
|
68
72
|
|
|
69
73
|
|
|
@@ -74,7 +78,7 @@ def load(config_path):
|
|
|
74
78
|
@cli.command()
|
|
75
79
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
76
80
|
def preprocess(config_path):
|
|
77
|
-
"""
|
|
81
|
+
"""Preprocessing."""
|
|
78
82
|
preprocess_adata(config_path)
|
|
79
83
|
|
|
80
84
|
|
|
@@ -85,7 +89,7 @@ def preprocess(config_path):
|
|
|
85
89
|
@cli.command()
|
|
86
90
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
87
91
|
def spatial(config_path):
|
|
88
|
-
"""
|
|
92
|
+
"""Spatial signal analysis"""
|
|
89
93
|
spatial_adata(config_path)
|
|
90
94
|
|
|
91
95
|
|
|
@@ -96,13 +100,57 @@ def spatial(config_path):
|
|
|
96
100
|
@cli.command()
|
|
97
101
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
98
102
|
def hmm(config_path):
|
|
99
|
-
"""
|
|
103
|
+
"""HMM feature annotations and plotting"""
|
|
100
104
|
hmm_adata(config_path)
|
|
101
105
|
|
|
102
106
|
|
|
103
107
|
##########################################
|
|
104
108
|
|
|
105
109
|
|
|
110
|
+
####### Latent ###########
|
|
111
|
+
@cli.command()
|
|
112
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
113
|
+
def latent(config_path):
|
|
114
|
+
"""Latent representations of signal"""
|
|
115
|
+
latent_adata(config_path)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
##########################################
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
####### Variant ###########
|
|
122
|
+
@cli.command()
|
|
123
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
124
|
+
def variant(config_path):
|
|
125
|
+
"""Sequence variation analyses"""
|
|
126
|
+
variant_adata(config_path)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
##########################################
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
####### Chimeric ###########
|
|
133
|
+
@cli.command()
|
|
134
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
135
|
+
def chimeric(config_path):
|
|
136
|
+
"""Finding putative PCR chimeras"""
|
|
137
|
+
chimeric_adata(config_path)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
##########################################
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
####### Recipes ###########
|
|
144
|
+
@cli.command()
|
|
145
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
146
|
+
def full(config_path):
|
|
147
|
+
"""Workflow: load preprocess spatial variant chimeric hmm latent"""
|
|
148
|
+
full_flow(config_path)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
##########################################
|
|
152
|
+
|
|
153
|
+
|
|
106
154
|
####### batch command ###########
|
|
107
155
|
@cli.command()
|
|
108
156
|
@click.argument(
|
smftools/config/conversion.yaml
CHANGED
smftools/config/deaminase.yaml
CHANGED