smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/preprocess_adata.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Optional, Tuple
|
|
5
6
|
|
|
6
7
|
import anndata as ad
|
|
7
8
|
|
|
8
|
-
from smftools.
|
|
9
|
+
from smftools.constants import LOGGING_DIR, PREPROCESS_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
9
11
|
|
|
10
12
|
logger = get_logger(__name__)
|
|
11
13
|
|
|
@@ -36,30 +38,23 @@ def preprocess_adata(
|
|
|
36
38
|
Path to preprocessed, duplicate-removed AnnData.
|
|
37
39
|
"""
|
|
38
40
|
from ..readwrite import safe_read_h5ad
|
|
39
|
-
from .helpers import get_adata_paths
|
|
40
|
-
from .load_adata import load_adata
|
|
41
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
41
42
|
|
|
42
43
|
# 1) Ensure config is loaded and at least *some* AnnData stage exists
|
|
43
|
-
|
|
44
|
+
cfg = load_experiment_config(config_path)
|
|
44
45
|
|
|
45
46
|
# 2) Compute canonical paths
|
|
46
47
|
paths = get_adata_paths(cfg)
|
|
47
48
|
raw_path = paths.raw
|
|
48
49
|
pp_path = paths.pp
|
|
49
50
|
pp_dedup_path = paths.pp_dedup
|
|
50
|
-
spatial_path = paths.spatial
|
|
51
|
-
hmm_path = paths.hmm
|
|
52
51
|
|
|
53
52
|
raw_exists = raw_path.exists()
|
|
54
53
|
pp_exists = pp_path.exists()
|
|
55
54
|
pp_dedup_exists = pp_dedup_path.exists()
|
|
56
|
-
spatial_exists = spatial_path.exists()
|
|
57
|
-
hmm_exists = hmm_path.exists()
|
|
58
55
|
|
|
59
|
-
# Helper:
|
|
56
|
+
# Helper: read from disk
|
|
60
57
|
def _load(path: Path):
|
|
61
|
-
if loaded_adata is not None and loaded_path == path:
|
|
62
|
-
return loaded_adata
|
|
63
58
|
adata, _ = safe_read_h5ad(path)
|
|
64
59
|
return adata
|
|
65
60
|
|
|
@@ -67,20 +62,8 @@ def preprocess_adata(
|
|
|
67
62
|
# Case A: full redo of preprocessing
|
|
68
63
|
# -----------------------------
|
|
69
64
|
if getattr(cfg, "force_redo_preprocessing", False):
|
|
70
|
-
logger.info(
|
|
71
|
-
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
if hmm_exists:
|
|
75
|
-
adata = _load(hmm_path)
|
|
76
|
-
source_path = hmm_path
|
|
77
|
-
elif spatial_exists:
|
|
78
|
-
adata = _load(spatial_path)
|
|
79
|
-
source_path = spatial_path
|
|
80
|
-
elif pp_dedup_exists:
|
|
81
|
-
adata = _load(pp_dedup_path)
|
|
82
|
-
source_path = pp_dedup_path
|
|
83
|
-
elif pp_exists:
|
|
65
|
+
logger.info("Forcing full redo of preprocessing workflow.")
|
|
66
|
+
if pp_exists:
|
|
84
67
|
adata = _load(pp_path)
|
|
85
68
|
source_path = pp_path
|
|
86
69
|
elif raw_exists:
|
|
@@ -135,26 +118,16 @@ def preprocess_adata(
|
|
|
135
118
|
# Case C: normal behavior (no explicit redo flags)
|
|
136
119
|
# -----------------------------
|
|
137
120
|
|
|
138
|
-
# If HMM exists, preprocessing is considered “done enough”
|
|
139
|
-
if hmm_exists:
|
|
140
|
-
logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
|
|
141
|
-
return (None, None, None, None)
|
|
142
|
-
|
|
143
|
-
# If spatial exists, also skip re-preprocessing by default
|
|
144
|
-
if spatial_exists:
|
|
145
|
-
logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
|
|
146
|
-
return (None, None, None, None)
|
|
147
|
-
|
|
148
121
|
# If pp_dedup exists, just return paths (no recomputation)
|
|
149
122
|
if pp_dedup_exists:
|
|
150
|
-
logger.
|
|
123
|
+
logger.info(
|
|
151
124
|
f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
|
|
152
125
|
)
|
|
153
126
|
return (None, pp_path, None, pp_dedup_path)
|
|
154
127
|
|
|
155
128
|
# If pp exists but pp_dedup does not, load pp and run core
|
|
156
129
|
if pp_exists:
|
|
157
|
-
logger.
|
|
130
|
+
logger.info(f"Preprocessed AnnData found: {pp_path}")
|
|
158
131
|
adata = _load(pp_path)
|
|
159
132
|
source_path = pp_path
|
|
160
133
|
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
@@ -218,13 +191,19 @@ def preprocess_adata_core(
|
|
|
218
191
|
pp_dup_rem_adata_path : Path
|
|
219
192
|
Path where pp_dedup_adata was written.
|
|
220
193
|
"""
|
|
194
|
+
from datetime import datetime
|
|
221
195
|
from pathlib import Path
|
|
222
196
|
|
|
223
197
|
from ..metadata import record_smftools_metadata
|
|
224
|
-
from ..plotting import
|
|
198
|
+
from ..plotting import (
|
|
199
|
+
plot_read_qc_histograms,
|
|
200
|
+
plot_read_span_quality_clustermaps,
|
|
201
|
+
plot_sequence_integer_encoding_clustermaps,
|
|
202
|
+
)
|
|
225
203
|
from ..preprocessing import (
|
|
226
204
|
append_base_context,
|
|
227
205
|
append_binary_layer_by_base_context,
|
|
206
|
+
append_mismatch_frequency_sites,
|
|
228
207
|
binarize_adata,
|
|
229
208
|
binarize_on_Youden,
|
|
230
209
|
calculate_complexity_II,
|
|
@@ -235,22 +214,39 @@ def preprocess_adata_core(
|
|
|
235
214
|
filter_reads_on_length_quality_mapping,
|
|
236
215
|
filter_reads_on_modification_thresholds,
|
|
237
216
|
flag_duplicate_reads,
|
|
217
|
+
invert_adata,
|
|
238
218
|
load_sample_sheet,
|
|
219
|
+
reindex_references_adata,
|
|
239
220
|
)
|
|
240
221
|
from ..readwrite import make_dirs
|
|
241
222
|
from .helpers import write_gz_h5ad
|
|
242
223
|
|
|
243
224
|
################################### 1) Load existing ###################################
|
|
225
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
226
|
+
now = datetime.now()
|
|
227
|
+
time_str = now.strftime("%H%M%S")
|
|
228
|
+
|
|
229
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
230
|
+
|
|
244
231
|
# General config variable init - Necessary user passed inputs
|
|
245
232
|
smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
|
|
246
233
|
output_directory = Path(
|
|
247
234
|
cfg.output_directory
|
|
248
235
|
) # Path to the output directory to make for the analysis. Necessary.
|
|
249
|
-
|
|
236
|
+
preprocess_directory = output_directory / PREPROCESS_DIR
|
|
237
|
+
logging_directory = preprocess_directory / LOGGING_DIR
|
|
250
238
|
|
|
251
|
-
|
|
252
|
-
|
|
239
|
+
make_dirs([output_directory, preprocess_directory])
|
|
240
|
+
|
|
241
|
+
if cfg.emit_log_file:
|
|
242
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
243
|
+
make_dirs([logging_directory])
|
|
244
|
+
else:
|
|
245
|
+
log_file = None
|
|
253
246
|
|
|
247
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
248
|
+
|
|
249
|
+
######### Begin Preprocessing #########
|
|
254
250
|
## Load sample sheet metadata based on barcode mapping ##
|
|
255
251
|
if getattr(cfg, "sample_sheet_path", None):
|
|
256
252
|
load_sample_sheet(
|
|
@@ -264,12 +260,12 @@ def preprocess_adata_core(
|
|
|
264
260
|
pass
|
|
265
261
|
|
|
266
262
|
# Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
|
|
267
|
-
pp_length_qc_dir =
|
|
263
|
+
pp_length_qc_dir = preprocess_directory / "01_Read_length_and_quality_QC_metrics"
|
|
268
264
|
|
|
269
265
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
270
266
|
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
271
267
|
else:
|
|
272
|
-
make_dirs([
|
|
268
|
+
make_dirs([preprocess_directory, pp_length_qc_dir])
|
|
273
269
|
plot_read_qc_histograms(
|
|
274
270
|
adata,
|
|
275
271
|
pp_length_qc_dir,
|
|
@@ -292,12 +288,12 @@ def preprocess_adata_core(
|
|
|
292
288
|
)
|
|
293
289
|
print(adata.shape)
|
|
294
290
|
|
|
295
|
-
pp_length_qc_dir =
|
|
291
|
+
pp_length_qc_dir = preprocess_directory / "02_Read_length_and_quality_QC_metrics_post_filtering"
|
|
296
292
|
|
|
297
293
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
298
294
|
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
299
295
|
else:
|
|
300
|
-
make_dirs([
|
|
296
|
+
make_dirs([preprocess_directory, pp_length_qc_dir])
|
|
301
297
|
plot_read_qc_histograms(
|
|
302
298
|
adata,
|
|
303
299
|
pp_length_qc_dir,
|
|
@@ -310,7 +306,7 @@ def preprocess_adata_core(
|
|
|
310
306
|
if smf_modality == "direct":
|
|
311
307
|
native = True
|
|
312
308
|
if cfg.fit_position_methylation_thresholds:
|
|
313
|
-
pp_Youden_dir =
|
|
309
|
+
pp_Youden_dir = preprocess_directory / "02B_Position_wide_Youden_threshold_performance"
|
|
314
310
|
make_dirs([pp_Youden_dir])
|
|
315
311
|
# Calculate positional methylation thresholds for mod calls
|
|
316
312
|
calculate_position_Youden(
|
|
@@ -359,7 +355,6 @@ def preprocess_adata_core(
|
|
|
359
355
|
)
|
|
360
356
|
|
|
361
357
|
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
|
|
362
|
-
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
363
358
|
append_base_context(
|
|
364
359
|
adata,
|
|
365
360
|
ref_column=cfg.reference_column,
|
|
@@ -378,17 +373,18 @@ def preprocess_adata_core(
|
|
|
378
373
|
cfg.mod_target_bases,
|
|
379
374
|
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
380
375
|
force_redo=cfg.force_redo_calculate_read_modification_stats,
|
|
376
|
+
smf_modality=cfg.smf_modality,
|
|
381
377
|
)
|
|
382
378
|
|
|
383
379
|
### Make a dir for outputting sample level read modification metrics before filtering ###
|
|
384
|
-
pp_meth_qc_dir =
|
|
380
|
+
pp_meth_qc_dir = preprocess_directory / "03_read_modification_QC_metrics"
|
|
385
381
|
|
|
386
382
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
387
383
|
logger.debug(
|
|
388
384
|
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
389
385
|
)
|
|
390
386
|
else:
|
|
391
|
-
make_dirs([
|
|
387
|
+
make_dirs([preprocess_directory, pp_meth_qc_dir])
|
|
392
388
|
obs_to_plot = ["Raw_modification_signal"]
|
|
393
389
|
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
394
390
|
obs_to_plot += [
|
|
@@ -422,14 +418,14 @@ def preprocess_adata_core(
|
|
|
422
418
|
force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
|
|
423
419
|
)
|
|
424
420
|
|
|
425
|
-
pp_meth_qc_dir =
|
|
421
|
+
pp_meth_qc_dir = preprocess_directory / "04_read_modification_QC_metrics_post_filtering"
|
|
426
422
|
|
|
427
423
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
428
424
|
logger.debug(
|
|
429
425
|
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
430
426
|
)
|
|
431
427
|
else:
|
|
432
|
-
make_dirs([
|
|
428
|
+
make_dirs([preprocess_directory, pp_meth_qc_dir])
|
|
433
429
|
obs_to_plot = ["Raw_modification_signal"]
|
|
434
430
|
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
435
431
|
obs_to_plot += [
|
|
@@ -489,7 +485,7 @@ def preprocess_adata_core(
|
|
|
489
485
|
for site_type in cfg.duplicate_detection_site_types:
|
|
490
486
|
var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
|
|
491
487
|
|
|
492
|
-
pp_dup_qc_dir =
|
|
488
|
+
pp_dup_qc_dir = preprocess_directory / "05_read_duplication_QC_metrics"
|
|
493
489
|
|
|
494
490
|
make_dirs([pp_dup_qc_dir])
|
|
495
491
|
|
|
@@ -514,7 +510,7 @@ def preprocess_adata_core(
|
|
|
514
510
|
hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
|
|
515
511
|
hierarchical_metric="euclidean",
|
|
516
512
|
hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
517
|
-
demux_types=
|
|
513
|
+
demux_types=cfg.duplicate_detection_demux_types_to_use,
|
|
518
514
|
demux_col="demux_type",
|
|
519
515
|
)
|
|
520
516
|
|
|
@@ -541,6 +537,135 @@ def preprocess_adata_core(
|
|
|
541
537
|
adata_unique = adata
|
|
542
538
|
########################################################################################################################
|
|
543
539
|
|
|
540
|
+
# -----------------------------
|
|
541
|
+
# Optional inversion along positions axis
|
|
542
|
+
# -----------------------------
|
|
543
|
+
if getattr(cfg, "invert_adata", False):
|
|
544
|
+
adata = invert_adata(adata)
|
|
545
|
+
|
|
546
|
+
# -----------------------------
|
|
547
|
+
# Optional reindexing by reference
|
|
548
|
+
# -----------------------------
|
|
549
|
+
reindex_references_adata(
|
|
550
|
+
adata,
|
|
551
|
+
reference_col=cfg.reference_column,
|
|
552
|
+
offsets=cfg.reindexing_offsets,
|
|
553
|
+
new_col=cfg.reindexed_var_suffix,
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
############################################### Append mismatch frequency per position ###############################################
|
|
557
|
+
append_mismatch_frequency_sites(
|
|
558
|
+
adata_unique,
|
|
559
|
+
ref_column=cfg.reference_column,
|
|
560
|
+
mismatch_layer=cfg.mismatch_frequency_layer,
|
|
561
|
+
read_span_layer=cfg.mismatch_frequency_read_span_layer,
|
|
562
|
+
mismatch_frequency_range=cfg.mismatch_frequency_range,
|
|
563
|
+
bypass=cfg.bypass_append_mismatch_frequency_sites,
|
|
564
|
+
force_redo=cfg.force_redo_append_mismatch_frequency_sites,
|
|
565
|
+
)
|
|
566
|
+
|
|
567
|
+
############################################### Plot integer sequence encoding clustermaps ###############################################
|
|
568
|
+
if "sequence_integer_encoding" not in adata.layers:
|
|
569
|
+
logger.debug(
|
|
570
|
+
"sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
|
|
571
|
+
)
|
|
572
|
+
else:
|
|
573
|
+
pp_seq_clustermap_dir = preprocess_directory / "06_sequence_integer_encoding_clustermaps"
|
|
574
|
+
if pp_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
575
|
+
logger.debug(
|
|
576
|
+
f"{pp_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
|
|
577
|
+
)
|
|
578
|
+
else:
|
|
579
|
+
make_dirs([pp_seq_clustermap_dir])
|
|
580
|
+
plot_sequence_integer_encoding_clustermaps(
|
|
581
|
+
adata,
|
|
582
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
583
|
+
reference_col=cfg.reference_column,
|
|
584
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
585
|
+
min_quality=None,
|
|
586
|
+
min_length=None,
|
|
587
|
+
min_mapped_length_to_reference_length_ratio=None,
|
|
588
|
+
sort_by="none",
|
|
589
|
+
max_unknown_fraction=0.5,
|
|
590
|
+
save_path=pp_seq_clustermap_dir,
|
|
591
|
+
show_position_axis=True,
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
pp_dedup_seq_clustermap_dir = (
|
|
595
|
+
preprocess_directory / "deduplicated" / "06_sequence_integer_encoding_clustermaps"
|
|
596
|
+
)
|
|
597
|
+
if pp_dedup_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
598
|
+
logger.debug(
|
|
599
|
+
f"{pp_dedup_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
|
|
600
|
+
)
|
|
601
|
+
else:
|
|
602
|
+
make_dirs([pp_dedup_seq_clustermap_dir])
|
|
603
|
+
plot_sequence_integer_encoding_clustermaps(
|
|
604
|
+
adata_unique,
|
|
605
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
606
|
+
reference_col=cfg.reference_column,
|
|
607
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
608
|
+
min_quality=None,
|
|
609
|
+
min_length=None,
|
|
610
|
+
min_mapped_length_to_reference_length_ratio=None,
|
|
611
|
+
sort_by="none",
|
|
612
|
+
max_unknown_fraction=0.5,
|
|
613
|
+
save_path=pp_dedup_seq_clustermap_dir,
|
|
614
|
+
show_position_axis=True,
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
############################################### Plot read span mask + base quality clustermaps ###############################################
|
|
618
|
+
quality_layer = None
|
|
619
|
+
if "base_quality_scores" in adata.layers:
|
|
620
|
+
quality_layer = "base_quality_scores"
|
|
621
|
+
elif "base_qualities" in adata.layers:
|
|
622
|
+
quality_layer = "base_qualities"
|
|
623
|
+
|
|
624
|
+
if "read_span_mask" not in adata.layers or quality_layer is None:
|
|
625
|
+
logger.debug(
|
|
626
|
+
"read_span_mask and base quality layers not found; skipping read span/base quality clustermaps."
|
|
627
|
+
)
|
|
628
|
+
else:
|
|
629
|
+
pp_span_quality_dir = preprocess_directory / "07_read_span_quality_clustermaps"
|
|
630
|
+
if pp_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
631
|
+
logger.debug(
|
|
632
|
+
f"{pp_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
|
|
633
|
+
)
|
|
634
|
+
else:
|
|
635
|
+
make_dirs([pp_span_quality_dir])
|
|
636
|
+
plot_read_span_quality_clustermaps(
|
|
637
|
+
adata,
|
|
638
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
639
|
+
reference_col=cfg.reference_column,
|
|
640
|
+
quality_layer=quality_layer,
|
|
641
|
+
read_span_layer="read_span_mask",
|
|
642
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
643
|
+
save_path=pp_span_quality_dir,
|
|
644
|
+
show_position_axis=True,
|
|
645
|
+
max_nan_fraction=0.5,
|
|
646
|
+
)
|
|
647
|
+
|
|
648
|
+
pp_dedup_span_quality_dir = (
|
|
649
|
+
preprocess_directory / "deduplicated" / "07_read_span_quality_clustermaps"
|
|
650
|
+
)
|
|
651
|
+
if pp_dedup_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
652
|
+
logger.debug(
|
|
653
|
+
f"{pp_dedup_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
|
|
654
|
+
)
|
|
655
|
+
elif quality_layer in adata_unique.layers and "read_span_mask" in adata_unique.layers:
|
|
656
|
+
make_dirs([pp_dedup_span_quality_dir])
|
|
657
|
+
plot_read_span_quality_clustermaps(
|
|
658
|
+
adata_unique,
|
|
659
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
660
|
+
reference_col=cfg.reference_column,
|
|
661
|
+
quality_layer=quality_layer,
|
|
662
|
+
read_span_layer="read_span_mask",
|
|
663
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
664
|
+
save_path=pp_dedup_span_quality_dir,
|
|
665
|
+
show_position_axis=True,
|
|
666
|
+
max_nan_fraction=0.5,
|
|
667
|
+
)
|
|
668
|
+
|
|
544
669
|
############################################### Save preprocessed adata with duplicate detection ###############################################
|
|
545
670
|
if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
546
671
|
logger.info("Saving preprocessed adata.")
|