smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +18 -2
- smftools/cli/hmm_adata.py +18 -1
- smftools/cli/latent_adata.py +522 -67
- smftools/cli/load_adata.py +2 -2
- smftools/cli/preprocess_adata.py +32 -93
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +23 -109
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +41 -5
- smftools/config/conversion.yaml +0 -10
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +49 -13
- smftools/config/experiment_config.py +96 -3
- smftools/constants.py +4 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +53 -13
- smftools/informatics/h5ad_functions.py +83 -0
- smftools/informatics/modkit_extract_to_adata.py +4 -0
- smftools/plotting/__init__.py +26 -12
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +58 -3362
- smftools/plotting/hmm_plotting.py +1586 -2
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +3 -0
- smftools/preprocessing/append_base_context.py +1 -1
- smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +109 -85
- smftools/tools/__init__.py +6 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_nmf.py +18 -7
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +70 -154
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +640 -3
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +52 -4
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import anndata as ad
|
|
8
|
+
|
|
9
|
+
from smftools.constants import LOGGING_DIR, VARIANT_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def variant_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for variant analyses.

    Called by: `smftools variant <config_path>`

    Responsibilities:
    - Load the experiment config and resolve the per-stage AnnData paths.
    - Decide whether to skip (variant AnnData already on disk) or run the core.
    - Pick the starting AnnData from the stages that exist on disk.
    - Call `variant_adata_core(...)` when actual work is needed.

    Args:
        config_path: Path to the experiment configuration file.

    Returns:
        - `(None, variant_path)` when the variant AnnData already exists and
          `cfg.force_redo_variant_analyses` is not set (nothing is loaded).
        - `(None, None)` when no stage AnnData exists to start from.
        - Otherwise the `(adata, path)` pair returned by `variant_adata_core`.
    """
    from ..readwrite import safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    # 1) Load the config and resolve the stage paths.
    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)
    variant_path = paths.variant

    # 2) Stage-skipping logic: report the *variant* output path when skipping, so the
    #    returned path always refers to the artifact this command is responsible for.
    if not getattr(cfg, "force_redo_variant_analyses", False) and variant_path.exists():
        logger.info(f"Variant AnnData found: {variant_path}\nSkipping smftools variant")
        return None, variant_path

    # 3) Decide which AnnData to use as the *starting point* for analyses.
    #    Candidates are listed in priority order; the first one on disk wins.
    candidate_paths = (
        paths.hmm,
        paths.latent,
        paths.spatial,
        paths.chimeric,
        variant_path,
        paths.pp_dedup,
        paths.pp,
    )
    source_path = next((path for path in candidate_paths if path.exists()), None)
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for variant analyses (need at least preprocessed)."
        )
        return None, None

    # safe_read_h5ad returns a pair; only the AnnData (first item) is needed here.
    start_adata, _ = safe_read_h5ad(source_path)

    # 4) Run the core
    adata_variant, variant_path = variant_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )

    return adata_variant, variant_path
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def variant_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core variant analysis pipeline.

    Assumes:
    - `cfg` is the ExperimentConfig.

    Does:
    - Configure logging and create `<output_directory>/<VARIANT_DIR>`.
    - Load sample sheet metadata when `cfg.sample_sheet_path` is set.
    - Call the `append_*` preprocessing steps (sequence mismatch annotations,
      mismatch frequency sites, variant call layer, variant segment layer) on `adata`.
    - Write mismatch / sequence-encoding / variant-segment plots into numbered
      sub-directories of `<variant dir>/deduplicated`, skipping directories that
      already exist.
    - Save AnnData to `paths.variant`.

    Args:
        adata: AnnData to analyse; passed to every step and returned.
        cfg: Experiment configuration object.
        paths: Stage path container; only `paths.variant` is read here.
        source_adata_path: Path `adata` was loaded from, recorded in the metadata.
        config_path: Path of the config file, recorded in the metadata.

    Returns:
        `(adata, paths.variant)`.

    Raises:
        ValueError: If `cfg.references_to_align_for_variant_annotation` is set but
            does not contain exactly two entries.
    """
    from datetime import datetime

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_mismatch_base_frequency_by_position,
        plot_sequence_integer_encoding_clustermaps,
        plot_variant_segment_clustermaps,
    )
    from ..preprocessing import (
        append_mismatch_frequency_sites,
        append_sequence_mismatch_annotations,
        append_variant_call_layer,
        append_variant_segment_layer,
        load_sample_sheet,
    )
    from ..readwrite import make_dirs
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # Read the clock once so the date and time parts of the log name always agree.
    now = datetime.now()
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M%S")
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    output_directory = Path(cfg.output_directory)
    variant_directory = output_directory / VARIANT_DIR
    logging_directory = variant_directory / LOGGING_DIR
    plot_root = variant_directory / "deduplicated"

    make_dirs([output_directory, variant_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # ============================================================
    # 1) Reference variant position annotation
    # ============================================================
    # A missing, null, or empty setting all mean "no reference pair configured".
    reference_pair = getattr(cfg, "references_to_align_for_variant_annotation", None) or (
        None,
        None,
    )
    if len(reference_pair) != 2:
        raise ValueError(
            "references_to_align_for_variant_annotation must contain exactly two "
            f"column names; got {reference_pair!r}"
        )
    seq1_col, seq2_col = reference_pair
    have_reference_pair = bool(seq1_col and seq2_col)

    if have_reference_pair:
        append_sequence_mismatch_annotations(adata, seq1_col, seq2_col)

    ############################################### Append mismatch frequency per position ###############################################
    append_mismatch_frequency_sites(
        adata,
        ref_column=cfg.reference_column,
        mismatch_layer=cfg.mismatch_frequency_layer,
        read_span_layer=cfg.mismatch_frequency_read_span_layer,
        mismatch_frequency_range=cfg.mismatch_frequency_range,
        bypass=cfg.bypass_append_mismatch_frequency_sites,
        force_redo=cfg.force_redo_append_mismatch_frequency_sites,
    )

    # ============================================================
    # 2) Per-read variant call layer at reference mismatch sites
    # ============================================================
    if have_reference_pair:
        # For conversion SMF, derive converted column names so variant calling
        # compares read bases against the converted reference (which reads are mapped to).
        # Either lookup may yield None; the value is forwarded unchanged.
        seq1_conv = _find_converted_column(seq1_col, adata.var.columns)
        seq2_conv = _find_converted_column(seq2_col, adata.var.columns)
        if seq1_conv and seq2_conv:
            logger.info("Using converted columns: '%s', '%s'", seq1_conv, seq2_conv)

        append_variant_call_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            seq1_converted_column=seq1_conv,
            seq2_converted_column=seq2_conv,
            read_span_layer=cfg.mismatch_frequency_read_span_layer,
            reference_col=cfg.reference_column,
        )

        append_variant_segment_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            read_span_layer=cfg.mismatch_frequency_read_span_layer,
            reference_col=cfg.reference_column,
        )

    ############################################### Plot mismatch base frequencies ###############################################
    # NOTE(review): plot directories 01/02 are regenerated on cfg.force_redo_preprocessing,
    # while 03-05 are never regenerated once they exist; confirm whether
    # force_redo_variant_analyses should apply to the plots as well.
    if cfg.mismatch_frequency_layer not in adata.layers:
        logger.debug(
            "Mismatch layer '%s' not found; skipping mismatch base frequency plots.",
            cfg.mismatch_frequency_layer,
        )
    elif not adata.uns.get("mismatch_integer_encoding_map"):
        logger.debug("Mismatch encoding map not found; skipping mismatch base frequency plots.")
    else:
        mismatch_base_freq_dir = plot_root / "01_mismatch_base_frequency_plots"
        if mismatch_base_freq_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{mismatch_base_freq_dir} already exists. Skipping mismatch base frequency plots."
            )
        else:
            make_dirs([mismatch_base_freq_dir])
            plot_mismatch_base_frequency_by_position(
                adata,
                sample_col=cfg.sample_name_col_for_plotting,
                reference_col=cfg.reference_column,
                mismatch_layer=cfg.mismatch_frequency_layer,
                read_span_layer=cfg.mismatch_frequency_read_span_layer,
                # Hard-coded: the `mismatch_base_frequency_exclude_mod_sites` key in
                # default.yaml is not consulted here.
                exclude_mod_sites=True,
                mod_site_bases=cfg.mod_target_bases,
                save_path=mismatch_base_freq_dir,
                plot_zscores=True,
            )

    ############################################### Plot integer sequence encoding clustermaps ###############################################
    if "sequence_integer_encoding" not in adata.layers:
        logger.debug(
            "sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
        )
    else:
        seq_clustermap_dir = plot_root / "02_sequence_integer_encoding_clustermaps"
        if seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
            )
        else:
            make_dirs([seq_clustermap_dir])
            plot_sequence_integer_encoding_clustermaps(
                adata,
                sample_col=cfg.sample_name_col_for_plotting,
                reference_col=cfg.reference_column,
                demux_types=cfg.clustermap_demux_types_to_plot,
                min_quality=None,
                min_length=None,
                min_mapped_length_to_reference_length_ratio=None,
                sort_by="none",
                max_unknown_fraction=0.5,
                save_path=seq_clustermap_dir,
                show_position_axis=True,
            )

        # NOTE(review): confirm this plot is meant to require the
        # sequence_integer_encoding layer in addition to the mismatch layer.
        if "mismatch_integer_encoding" in adata.layers:
            mismatch_clustermap_dir = (
                plot_root / "03_mismatch_integer_encoding_clustermaps_no_mod_sites"
            )
            if mismatch_clustermap_dir.is_dir():
                logger.debug(
                    f"{mismatch_clustermap_dir} already exists. "
                    "Skipping mismatch clustermaps without mod sites."
                )
            else:
                make_dirs([mismatch_clustermap_dir])
                plot_sequence_integer_encoding_clustermaps(
                    adata,
                    sample_col=cfg.sample_name_col_for_plotting,
                    reference_col=cfg.reference_column,
                    demux_types=cfg.clustermap_demux_types_to_plot,
                    min_quality=None,
                    min_length=None,
                    min_mapped_length_to_reference_length_ratio=None,
                    sort_by="none",
                    max_unknown_fraction=0.5,
                    save_path=mismatch_clustermap_dir,
                    show_position_axis=True,
                    exclude_mod_sites=True,
                    mod_site_bases=cfg.mod_target_bases,
                )

    # ============================================================
    # 4) Variant segment clustermaps
    # ============================================================
    if have_reference_pair:
        segment_layer_name = f"{seq1_col}__{seq2_col}_variant_segments"
        if segment_layer_name in adata.layers:
            # (directory name, "already exists" log message, extra plot keyword arguments)
            segment_plots = (
                (
                    "04_variant_segment_clustermaps",
                    "Variant segment clustermaps already exist at %s; skipping.",
                    {},
                ),
                (
                    "05_variant_segment_clustermaps_with_mismatch_type",
                    "Variant segment mismatch-type clustermaps already exist at %s; skipping.",
                    {"mismatch_type_obs_col": "chimeric_variant_sites_type"},
                ),
            )
            for dir_name, skip_message, extra_kwargs in segment_plots:
                segment_dir = plot_root / dir_name
                if segment_dir.exists():
                    logger.info(skip_message, segment_dir)
                    continue
                make_dirs([segment_dir])
                plot_variant_segment_clustermaps(
                    adata,
                    seq1_column=seq1_col,
                    seq2_column=seq2_col,
                    sample_col=cfg.sample_name_col_for_plotting,
                    reference_col=cfg.reference_column,
                    variant_segment_layer=segment_layer_name,
                    read_span_layer=cfg.mismatch_frequency_read_span_layer,
                    save_path=segment_dir,
                    # NOTE(review): these fallbacks (white/black) are the reverse of the
                    # values shipped in default.yaml (black/white); confirm which is intended.
                    ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
                    ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
                    marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
                    show_position_axis=True,
                    **extra_kwargs,
                )

    # ============================================================
    # 5) Save AnnData
    # ============================================================
    # A forced redo must also rewrite an existing file; otherwise the recomputed
    # results would be discarded.
    force_redo = getattr(cfg, "force_redo_variant_analyses", False)
    if force_redo or not paths.variant.exists():
        logger.info("Saving variant AnnData")
        record_smftools_metadata(
            adata,
            step_name="variant",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=paths.variant,
        )
        write_gz_h5ad(adata, paths.variant)

    return adata, paths.variant


def _find_converted_column(unconverted_col: str, var_columns) -> Optional[str]:
    """Find the converted FASTA column corresponding to an unconverted one.

    Unconverted columns follow the pattern ``{chromosome}_{strand}_strand_FASTA_base``.
    Converted columns follow ``{chromosome}_{conversion}_{strand}_{strand}_strand_FASTA_base``
    (e.g. ``6B6_5mC_top_top_strand_FASTA_base`` for unconverted ``6B6_top_strand_FASTA_base``).

    Args:
        unconverted_col: Name of the unconverted FASTA base column.
        var_columns: Iterable of candidate column names to search.

    Returns:
        The first matching converted column name, or None when `unconverted_col`
        does not follow the unconverted pattern or nothing matches.
    """
    suffix = "_strand_FASTA_base"
    if not unconverted_col.endswith(suffix):
        return None

    stem = unconverted_col[: -len(suffix)]  # e.g. "6B6_top"
    for strand in ("top", "bottom"):
        strand_tag = f"_{strand}"
        if not stem.endswith(strand_tag):
            continue

        # "6B6_top" -> chrom="6B6". In the converted name the strand appears twice:
        # once in the record name, once in the suffix.
        chrom = stem[: -len(strand_tag)]
        prefix = f"{chrom}_"
        end = f"_{strand}_{strand}{suffix}"
        candidates = [
            column
            for column in var_columns
            if column.startswith(prefix) and column.endswith(end) and column != unconverted_col
        ]
        if len(candidates) > 1:
            logger.info(
                "Multiple converted column candidates for '%s': %s",
                unconverted_col,
                candidates,
            )
        # A stem can end with only one strand tag, so the search stops here either way.
        return candidates[0] if candidates else None

    return None
|
smftools/cli_entry.py
CHANGED
|
@@ -7,11 +7,14 @@ from typing import Sequence
|
|
|
7
7
|
import click
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
|
+
from .cli.chimeric_adata import chimeric_adata
|
|
10
11
|
from .cli.hmm_adata import hmm_adata
|
|
11
12
|
from .cli.latent_adata import latent_adata
|
|
12
13
|
from .cli.load_adata import load_adata
|
|
13
14
|
from .cli.preprocess_adata import preprocess_adata
|
|
15
|
+
from .cli.recipes import full_flow
|
|
14
16
|
from .cli.spatial_adata import spatial_adata
|
|
17
|
+
from .cli.variant_adata import variant_adata
|
|
15
18
|
from .informatics.pod5_functions import subsample_pod5
|
|
16
19
|
from .logging_utils import get_logger, setup_logging
|
|
17
20
|
from .readwrite import concatenate_h5ads
|
|
@@ -64,7 +67,7 @@ def cli(log_file: Path | None, log_level: str):
|
|
|
64
67
|
@cli.command()
|
|
65
68
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
66
69
|
def load(config_path):
|
|
67
|
-
"""Load
|
|
70
|
+
"""Load raw data into AnnData."""
|
|
68
71
|
load_adata(config_path)
|
|
69
72
|
|
|
70
73
|
|
|
@@ -75,7 +78,7 @@ def load(config_path):
|
|
|
75
78
|
@cli.command()
|
|
76
79
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
77
80
|
def preprocess(config_path):
|
|
78
|
-
"""
|
|
81
|
+
"""Preprocessing."""
|
|
79
82
|
preprocess_adata(config_path)
|
|
80
83
|
|
|
81
84
|
|
|
@@ -86,7 +89,7 @@ def preprocess(config_path):
|
|
|
86
89
|
@cli.command()
|
|
87
90
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
88
91
|
def spatial(config_path):
|
|
89
|
-
"""
|
|
92
|
+
"""Spatial signal analysis"""
|
|
90
93
|
spatial_adata(config_path)
|
|
91
94
|
|
|
92
95
|
|
|
@@ -97,7 +100,7 @@ def spatial(config_path):
|
|
|
97
100
|
@cli.command()
|
|
98
101
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
99
102
|
def hmm(config_path):
|
|
100
|
-
"""
|
|
103
|
+
"""HMM feature annotations and plotting"""
|
|
101
104
|
hmm_adata(config_path)
|
|
102
105
|
|
|
103
106
|
|
|
@@ -108,13 +111,46 @@ def hmm(config_path):
|
|
|
108
111
|
@cli.command()
|
|
109
112
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
110
113
|
def latent(config_path):
|
|
111
|
-
"""
|
|
114
|
+
"""Latent representations of signal"""
|
|
112
115
|
latent_adata(config_path)
|
|
113
116
|
|
|
114
117
|
|
|
115
118
|
##########################################
|
|
116
119
|
|
|
117
120
|
|
|
121
|
+
####### Variant ###########
|
|
122
|
+
@cli.command()
|
|
123
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
124
|
+
def variant(config_path):
|
|
125
|
+
"""Sequence variation analyses"""
|
|
126
|
+
variant_adata(config_path)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
##########################################
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
####### Chimeric ###########
|
|
133
|
+
@cli.command()
|
|
134
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
135
|
+
def chimeric(config_path):
|
|
136
|
+
"""Finding putative PCR chimeras"""
|
|
137
|
+
chimeric_adata(config_path)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
##########################################
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
####### Recipes ###########
|
|
144
|
+
@cli.command()
|
|
145
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
146
|
+
def full(config_path):
|
|
147
|
+
"""Workflow: load preprocess spatial variant chimeric hmm latent"""
|
|
148
|
+
full_flow(config_path)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
##########################################
|
|
152
|
+
|
|
153
|
+
|
|
118
154
|
####### batch command ###########
|
|
119
155
|
@cli.command()
|
|
120
156
|
@click.argument(
|
smftools/config/conversion.yaml
CHANGED
|
@@ -15,16 +15,6 @@ autocorr_site_types:
|
|
|
15
15
|
|
|
16
16
|
# Spatial Analysis - Clustermap params
|
|
17
17
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
18
|
-
rolling_nn_layer: "nan0_0minus1"
|
|
19
|
-
rolling_nn_plot_layer: "nan0_0minus1"
|
|
20
|
-
rolling_nn_window: 30
|
|
21
|
-
rolling_nn_step: 2
|
|
22
|
-
rolling_nn_min_overlap: 20
|
|
23
|
-
rolling_nn_return_fraction: true
|
|
24
|
-
rolling_nn_obsm_key: "rolling_nn_dist"
|
|
25
|
-
rolling_nn_site_types:
|
|
26
|
-
- "GpC"
|
|
27
|
-
- "CpG"
|
|
28
18
|
clustermap_cmap_c: "coolwarm"
|
|
29
19
|
clustermap_cmap_gpc: "coolwarm"
|
|
30
20
|
clustermap_cmap_cpg: "viridis"
|
smftools/config/deaminase.yaml
CHANGED
smftools/config/default.yaml
CHANGED
|
@@ -110,7 +110,7 @@ read_len_to_ref_ratio_filter_thresholds:
|
|
|
110
110
|
- null
|
|
111
111
|
- null
|
|
112
112
|
read_quality_filter_thresholds:
|
|
113
|
-
-
|
|
113
|
+
- 10
|
|
114
114
|
- null
|
|
115
115
|
read_mapping_quality_filter_thresholds:
|
|
116
116
|
- null
|
|
@@ -130,7 +130,7 @@ read_mod_filtering_a_thresholds:
|
|
|
130
130
|
- 0.025
|
|
131
131
|
- 0.975
|
|
132
132
|
read_mod_filtering_use_other_c_as_background: False
|
|
133
|
-
min_valid_fraction_positions_in_read_vs_ref: 0.
|
|
133
|
+
min_valid_fraction_positions_in_read_vs_ref: 0.2
|
|
134
134
|
|
|
135
135
|
# Plotting params for read length histograms
|
|
136
136
|
obs_to_plot_pp_qc:
|
|
@@ -162,12 +162,13 @@ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical cl
|
|
|
162
162
|
duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
|
|
163
163
|
|
|
164
164
|
# Position QC params
|
|
165
|
-
position_max_nan_threshold: 0.
|
|
165
|
+
position_max_nan_threshold: 0.8 # Maximum fraction of NaN values to tolerate in a column
|
|
166
166
|
mismatch_frequency_range:
|
|
167
167
|
- 0.01
|
|
168
168
|
- 0.99
|
|
169
169
|
mismatch_frequency_layer: "mismatch_integer_encoding"
|
|
170
170
|
mismatch_frequency_read_span_layer: "read_span_mask"
|
|
171
|
+
mismatch_base_frequency_exclude_mod_sites: True
|
|
171
172
|
|
|
172
173
|
######## smftools spatial params #########
|
|
173
174
|
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
@@ -186,13 +187,56 @@ clustermap_cmap_gpc: "coolwarm"
|
|
|
186
187
|
clustermap_cmap_cpg: "coolwarm"
|
|
187
188
|
clustermap_cmap_a: "coolwarm"
|
|
188
189
|
spatial_clustermap_sortby: "gpc"
|
|
190
|
+
|
|
191
|
+
# Clustermap variant params
|
|
192
|
+
overlay_variant_calls: false
|
|
193
|
+
variant_overlay_seq1_color: "black"
|
|
194
|
+
variant_overlay_seq2_color: "white"
|
|
195
|
+
variant_overlay_marker_size: 4.0
|
|
196
|
+
|
|
197
|
+
# Spatial analysis - Rolling NN Hamming
|
|
198
|
+
rolling_nn_layer: "nan0_0minus1"
|
|
199
|
+
rolling_nn_plot_layer: "nan0_0minus1"
|
|
200
|
+
rolling_nn_plot_layers:
|
|
201
|
+
- "nan0_0minus1"
|
|
202
|
+
- "zero_hamming_distance_spans"
|
|
203
|
+
rolling_nn_window: 10
|
|
204
|
+
rolling_nn_step: 1
|
|
205
|
+
rolling_nn_min_overlap: 8
|
|
206
|
+
rolling_nn_return_fraction: true
|
|
207
|
+
rolling_nn_obsm_key: "rolling_nn_dist"
|
|
189
208
|
rolling_nn_site_types:
|
|
190
209
|
- "GpC"
|
|
191
210
|
- "CpG"
|
|
192
|
-
|
|
193
|
-
|
|
211
|
+
rolling_nn_write_zero_pairs_csvs: true
|
|
212
|
+
rolling_nn_zero_pairs_uns_key: null
|
|
213
|
+
rolling_nn_zero_pairs_segments_key: null
|
|
214
|
+
rolling_nn_zero_pairs_layer_key: null
|
|
215
|
+
rolling_nn_zero_pairs_refine: true
|
|
216
|
+
rolling_nn_zero_pairs_max_nan_run: 2
|
|
217
|
+
rolling_nn_zero_pairs_merge_gap: 1
|
|
218
|
+
rolling_nn_zero_pairs_max_segments_per_read: 2
|
|
219
|
+
rolling_nn_zero_pairs_max_overlap: 5
|
|
220
|
+
rolling_nn_zero_pairs_layer_overlap_mode: "sum"
|
|
221
|
+
rolling_nn_zero_pairs_layer_overlap_value: null
|
|
222
|
+
rolling_nn_zero_pairs_keep_uns: true
|
|
223
|
+
rolling_nn_zero_pairs_segments_keep_uns: true
|
|
224
|
+
rolling_nn_zero_pairs_top_segments_per_read: 3
|
|
225
|
+
rolling_nn_zero_pairs_top_segments_max_overlap: 5
|
|
226
|
+
rolling_nn_zero_pairs_top_segments_min_span: 300
|
|
227
|
+
rolling_nn_zero_pairs_top_segments_write_csvs: true
|
|
228
|
+
rolling_nn_zero_pairs_segment_histogram_bins: 30
|
|
229
|
+
|
|
230
|
+
# Cross-sample rolling NN analysis
|
|
231
|
+
cross_sample_analysis: true
|
|
232
|
+
cross_sample_grouping_col: null
|
|
233
|
+
cross_sample_random_seed: 42
|
|
234
|
+
delta_hamming_chimeric_span_threshold: 200
|
|
235
|
+
|
|
236
|
+
# Latent Analysis - UMAP/Leiden params
|
|
194
237
|
layer_for_umap_plotting: 'nan_half'
|
|
195
238
|
umap_layers_to_plot:
|
|
239
|
+
- "leiden"
|
|
196
240
|
- "mapped_length"
|
|
197
241
|
- "Raw_modification_signal"
|
|
198
242
|
|
|
@@ -279,21 +323,13 @@ hmm_merge_layer_features:
|
|
|
279
323
|
- ["all_accessible_features", 60]
|
|
280
324
|
clustermap_cmap_hmm: "coolwarm"
|
|
281
325
|
hmm_clustermap_feature_layers:
|
|
282
|
-
- all_accessible_features
|
|
283
326
|
- all_accessible_features_merged
|
|
284
|
-
- small_accessible_patch
|
|
285
|
-
- mid_accessible_patch
|
|
286
|
-
- large_accessible_patch
|
|
287
|
-
- large_accessible_patch_merged
|
|
288
|
-
- nucleosome_depleted_region
|
|
289
327
|
- nucleosome_depleted_region_merged
|
|
290
328
|
- small_bound_stretch
|
|
291
329
|
- medium_bound_stretch
|
|
292
330
|
- putative_nucleosome
|
|
293
|
-
- large_bound_stretch
|
|
294
331
|
- all_footprint_features
|
|
295
332
|
hmm_clustermap_length_layers:
|
|
296
|
-
- all_accessible_features
|
|
297
333
|
- all_accessible_features_merged
|
|
298
334
|
- all_footprint_features
|
|
299
335
|
hmm_clustermap_sortby: "hmm"
|