smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/preprocess_adata.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
1
4
|
from pathlib import Path
|
|
2
5
|
from typing import Optional, Tuple
|
|
3
6
|
|
|
4
7
|
import anndata as ad
|
|
5
8
|
|
|
6
|
-
from smftools.
|
|
9
|
+
from smftools.constants import LOGGING_DIR, PREPROCESS_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
7
11
|
|
|
8
12
|
logger = get_logger(__name__)
|
|
9
13
|
|
|
@@ -34,30 +38,23 @@ def preprocess_adata(
|
|
|
34
38
|
Path to preprocessed, duplicate-removed AnnData.
|
|
35
39
|
"""
|
|
36
40
|
from ..readwrite import safe_read_h5ad
|
|
37
|
-
from .helpers import get_adata_paths
|
|
38
|
-
from .load_adata import load_adata
|
|
41
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
39
42
|
|
|
40
43
|
# 1) Ensure config is loaded and at least *some* AnnData stage exists
|
|
41
|
-
|
|
44
|
+
cfg = load_experiment_config(config_path)
|
|
42
45
|
|
|
43
46
|
# 2) Compute canonical paths
|
|
44
47
|
paths = get_adata_paths(cfg)
|
|
45
48
|
raw_path = paths.raw
|
|
46
49
|
pp_path = paths.pp
|
|
47
50
|
pp_dedup_path = paths.pp_dedup
|
|
48
|
-
spatial_path = paths.spatial
|
|
49
|
-
hmm_path = paths.hmm
|
|
50
51
|
|
|
51
52
|
raw_exists = raw_path.exists()
|
|
52
53
|
pp_exists = pp_path.exists()
|
|
53
54
|
pp_dedup_exists = pp_dedup_path.exists()
|
|
54
|
-
spatial_exists = spatial_path.exists()
|
|
55
|
-
hmm_exists = hmm_path.exists()
|
|
56
55
|
|
|
57
|
-
# Helper:
|
|
56
|
+
# Helper: read from disk
|
|
58
57
|
def _load(path: Path):
|
|
59
|
-
if loaded_adata is not None and loaded_path == path:
|
|
60
|
-
return loaded_adata
|
|
61
58
|
adata, _ = safe_read_h5ad(path)
|
|
62
59
|
return adata
|
|
63
60
|
|
|
@@ -65,20 +62,8 @@ def preprocess_adata(
|
|
|
65
62
|
# Case A: full redo of preprocessing
|
|
66
63
|
# -----------------------------
|
|
67
64
|
if getattr(cfg, "force_redo_preprocessing", False):
|
|
68
|
-
logger.info(
|
|
69
|
-
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
if hmm_exists:
|
|
73
|
-
adata = _load(hmm_path)
|
|
74
|
-
source_path = hmm_path
|
|
75
|
-
elif spatial_exists:
|
|
76
|
-
adata = _load(spatial_path)
|
|
77
|
-
source_path = spatial_path
|
|
78
|
-
elif pp_dedup_exists:
|
|
79
|
-
adata = _load(pp_dedup_path)
|
|
80
|
-
source_path = pp_dedup_path
|
|
81
|
-
elif pp_exists:
|
|
65
|
+
logger.info("Forcing full redo of preprocessing workflow.")
|
|
66
|
+
if pp_exists:
|
|
82
67
|
adata = _load(pp_path)
|
|
83
68
|
source_path = pp_path
|
|
84
69
|
elif raw_exists:
|
|
@@ -133,26 +118,16 @@ def preprocess_adata(
|
|
|
133
118
|
# Case C: normal behavior (no explicit redo flags)
|
|
134
119
|
# -----------------------------
|
|
135
120
|
|
|
136
|
-
# If HMM exists, preprocessing is considered “done enough”
|
|
137
|
-
if hmm_exists:
|
|
138
|
-
logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
|
|
139
|
-
return (None, None, None, None)
|
|
140
|
-
|
|
141
|
-
# If spatial exists, also skip re-preprocessing by default
|
|
142
|
-
if spatial_exists:
|
|
143
|
-
logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
|
|
144
|
-
return (None, None, None, None)
|
|
145
|
-
|
|
146
121
|
# If pp_dedup exists, just return paths (no recomputation)
|
|
147
122
|
if pp_dedup_exists:
|
|
148
|
-
logger.
|
|
123
|
+
logger.info(
|
|
149
124
|
f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
|
|
150
125
|
)
|
|
151
126
|
return (None, pp_path, None, pp_dedup_path)
|
|
152
127
|
|
|
153
128
|
# If pp exists but pp_dedup does not, load pp and run core
|
|
154
129
|
if pp_exists:
|
|
155
|
-
logger.
|
|
130
|
+
logger.info(f"Preprocessed AnnData found: {pp_path}")
|
|
156
131
|
adata = _load(pp_path)
|
|
157
132
|
source_path = pp_path
|
|
158
133
|
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
@@ -216,13 +191,19 @@ def preprocess_adata_core(
|
|
|
216
191
|
pp_dup_rem_adata_path : Path
|
|
217
192
|
Path where pp_dedup_adata was written.
|
|
218
193
|
"""
|
|
194
|
+
from datetime import datetime
|
|
219
195
|
from pathlib import Path
|
|
220
196
|
|
|
221
197
|
from ..metadata import record_smftools_metadata
|
|
222
|
-
from ..plotting import
|
|
198
|
+
from ..plotting import (
|
|
199
|
+
plot_read_qc_histograms,
|
|
200
|
+
plot_read_span_quality_clustermaps,
|
|
201
|
+
plot_sequence_integer_encoding_clustermaps,
|
|
202
|
+
)
|
|
223
203
|
from ..preprocessing import (
|
|
224
204
|
append_base_context,
|
|
225
205
|
append_binary_layer_by_base_context,
|
|
206
|
+
append_mismatch_frequency_sites,
|
|
226
207
|
binarize_adata,
|
|
227
208
|
binarize_on_Youden,
|
|
228
209
|
calculate_complexity_II,
|
|
@@ -233,22 +214,39 @@ def preprocess_adata_core(
|
|
|
233
214
|
filter_reads_on_length_quality_mapping,
|
|
234
215
|
filter_reads_on_modification_thresholds,
|
|
235
216
|
flag_duplicate_reads,
|
|
217
|
+
invert_adata,
|
|
236
218
|
load_sample_sheet,
|
|
219
|
+
reindex_references_adata,
|
|
237
220
|
)
|
|
238
221
|
from ..readwrite import make_dirs
|
|
239
222
|
from .helpers import write_gz_h5ad
|
|
240
223
|
|
|
241
224
|
################################### 1) Load existing ###################################
|
|
225
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
226
|
+
now = datetime.now()
|
|
227
|
+
time_str = now.strftime("%H%M%S")
|
|
228
|
+
|
|
229
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
230
|
+
|
|
242
231
|
# General config variable init - Necessary user passed inputs
|
|
243
232
|
smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
|
|
244
233
|
output_directory = Path(
|
|
245
234
|
cfg.output_directory
|
|
246
235
|
) # Path to the output directory to make for the analysis. Necessary.
|
|
247
|
-
|
|
236
|
+
preprocess_directory = output_directory / PREPROCESS_DIR
|
|
237
|
+
logging_directory = preprocess_directory / LOGGING_DIR
|
|
248
238
|
|
|
249
|
-
|
|
250
|
-
|
|
239
|
+
make_dirs([output_directory, preprocess_directory])
|
|
240
|
+
|
|
241
|
+
if cfg.emit_log_file:
|
|
242
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
243
|
+
make_dirs([logging_directory])
|
|
244
|
+
else:
|
|
245
|
+
log_file = None
|
|
246
|
+
|
|
247
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
251
248
|
|
|
249
|
+
######### Begin Preprocessing #########
|
|
252
250
|
## Load sample sheet metadata based on barcode mapping ##
|
|
253
251
|
if getattr(cfg, "sample_sheet_path", None):
|
|
254
252
|
load_sample_sheet(
|
|
@@ -262,12 +260,12 @@ def preprocess_adata_core(
|
|
|
262
260
|
pass
|
|
263
261
|
|
|
264
262
|
# Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
|
|
265
|
-
pp_length_qc_dir =
|
|
263
|
+
pp_length_qc_dir = preprocess_directory / "01_Read_length_and_quality_QC_metrics"
|
|
266
264
|
|
|
267
265
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
268
266
|
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
269
267
|
else:
|
|
270
|
-
make_dirs([
|
|
268
|
+
make_dirs([preprocess_directory, pp_length_qc_dir])
|
|
271
269
|
plot_read_qc_histograms(
|
|
272
270
|
adata,
|
|
273
271
|
pp_length_qc_dir,
|
|
@@ -290,12 +288,12 @@ def preprocess_adata_core(
|
|
|
290
288
|
)
|
|
291
289
|
print(adata.shape)
|
|
292
290
|
|
|
293
|
-
pp_length_qc_dir =
|
|
291
|
+
pp_length_qc_dir = preprocess_directory / "02_Read_length_and_quality_QC_metrics_post_filtering"
|
|
294
292
|
|
|
295
293
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
296
294
|
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
297
295
|
else:
|
|
298
|
-
make_dirs([
|
|
296
|
+
make_dirs([preprocess_directory, pp_length_qc_dir])
|
|
299
297
|
plot_read_qc_histograms(
|
|
300
298
|
adata,
|
|
301
299
|
pp_length_qc_dir,
|
|
@@ -308,7 +306,7 @@ def preprocess_adata_core(
|
|
|
308
306
|
if smf_modality == "direct":
|
|
309
307
|
native = True
|
|
310
308
|
if cfg.fit_position_methylation_thresholds:
|
|
311
|
-
pp_Youden_dir =
|
|
309
|
+
pp_Youden_dir = preprocess_directory / "02B_Position_wide_Youden_threshold_performance"
|
|
312
310
|
make_dirs([pp_Youden_dir])
|
|
313
311
|
# Calculate positional methylation thresholds for mod calls
|
|
314
312
|
calculate_position_Youden(
|
|
@@ -357,7 +355,6 @@ def preprocess_adata_core(
|
|
|
357
355
|
)
|
|
358
356
|
|
|
359
357
|
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
|
|
360
|
-
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
361
358
|
append_base_context(
|
|
362
359
|
adata,
|
|
363
360
|
ref_column=cfg.reference_column,
|
|
@@ -376,17 +373,18 @@ def preprocess_adata_core(
|
|
|
376
373
|
cfg.mod_target_bases,
|
|
377
374
|
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
378
375
|
force_redo=cfg.force_redo_calculate_read_modification_stats,
|
|
376
|
+
smf_modality=cfg.smf_modality,
|
|
379
377
|
)
|
|
380
378
|
|
|
381
379
|
### Make a dir for outputting sample level read modification metrics before filtering ###
|
|
382
|
-
pp_meth_qc_dir =
|
|
380
|
+
pp_meth_qc_dir = preprocess_directory / "03_read_modification_QC_metrics"
|
|
383
381
|
|
|
384
382
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
385
383
|
logger.debug(
|
|
386
384
|
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
387
385
|
)
|
|
388
386
|
else:
|
|
389
|
-
make_dirs([
|
|
387
|
+
make_dirs([preprocess_directory, pp_meth_qc_dir])
|
|
390
388
|
obs_to_plot = ["Raw_modification_signal"]
|
|
391
389
|
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
392
390
|
obs_to_plot += [
|
|
@@ -420,14 +418,14 @@ def preprocess_adata_core(
|
|
|
420
418
|
force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
|
|
421
419
|
)
|
|
422
420
|
|
|
423
|
-
pp_meth_qc_dir =
|
|
421
|
+
pp_meth_qc_dir = preprocess_directory / "04_read_modification_QC_metrics_post_filtering"
|
|
424
422
|
|
|
425
423
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
426
424
|
logger.debug(
|
|
427
425
|
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
428
426
|
)
|
|
429
427
|
else:
|
|
430
|
-
make_dirs([
|
|
428
|
+
make_dirs([preprocess_directory, pp_meth_qc_dir])
|
|
431
429
|
obs_to_plot = ["Raw_modification_signal"]
|
|
432
430
|
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
433
431
|
obs_to_plot += [
|
|
@@ -487,7 +485,7 @@ def preprocess_adata_core(
|
|
|
487
485
|
for site_type in cfg.duplicate_detection_site_types:
|
|
488
486
|
var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
|
|
489
487
|
|
|
490
|
-
pp_dup_qc_dir =
|
|
488
|
+
pp_dup_qc_dir = preprocess_directory / "05_read_duplication_QC_metrics"
|
|
491
489
|
|
|
492
490
|
make_dirs([pp_dup_qc_dir])
|
|
493
491
|
|
|
@@ -512,7 +510,7 @@ def preprocess_adata_core(
|
|
|
512
510
|
hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
|
|
513
511
|
hierarchical_metric="euclidean",
|
|
514
512
|
hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
515
|
-
demux_types=
|
|
513
|
+
demux_types=cfg.duplicate_detection_demux_types_to_use,
|
|
516
514
|
demux_col="demux_type",
|
|
517
515
|
)
|
|
518
516
|
|
|
@@ -539,6 +537,135 @@ def preprocess_adata_core(
|
|
|
539
537
|
adata_unique = adata
|
|
540
538
|
########################################################################################################################
|
|
541
539
|
|
|
540
|
+
# -----------------------------
|
|
541
|
+
# Optional inversion along positions axis
|
|
542
|
+
# -----------------------------
|
|
543
|
+
if getattr(cfg, "invert_adata", False):
|
|
544
|
+
adata = invert_adata(adata)
|
|
545
|
+
|
|
546
|
+
# -----------------------------
|
|
547
|
+
# Optional reindexing by reference
|
|
548
|
+
# -----------------------------
|
|
549
|
+
reindex_references_adata(
|
|
550
|
+
adata,
|
|
551
|
+
reference_col=cfg.reference_column,
|
|
552
|
+
offsets=cfg.reindexing_offsets,
|
|
553
|
+
new_col=cfg.reindexed_var_suffix,
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
############################################### Append mismatch frequency per position ###############################################
|
|
557
|
+
append_mismatch_frequency_sites(
|
|
558
|
+
adata_unique,
|
|
559
|
+
ref_column=cfg.reference_column,
|
|
560
|
+
mismatch_layer=cfg.mismatch_frequency_layer,
|
|
561
|
+
read_span_layer=cfg.mismatch_frequency_read_span_layer,
|
|
562
|
+
mismatch_frequency_range=cfg.mismatch_frequency_range,
|
|
563
|
+
bypass=cfg.bypass_append_mismatch_frequency_sites,
|
|
564
|
+
force_redo=cfg.force_redo_append_mismatch_frequency_sites,
|
|
565
|
+
)
|
|
566
|
+
|
|
567
|
+
############################################### Plot integer sequence encoding clustermaps ###############################################
|
|
568
|
+
if "sequence_integer_encoding" not in adata.layers:
|
|
569
|
+
logger.debug(
|
|
570
|
+
"sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
|
|
571
|
+
)
|
|
572
|
+
else:
|
|
573
|
+
pp_seq_clustermap_dir = preprocess_directory / "06_sequence_integer_encoding_clustermaps"
|
|
574
|
+
if pp_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
575
|
+
logger.debug(
|
|
576
|
+
f"{pp_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
|
|
577
|
+
)
|
|
578
|
+
else:
|
|
579
|
+
make_dirs([pp_seq_clustermap_dir])
|
|
580
|
+
plot_sequence_integer_encoding_clustermaps(
|
|
581
|
+
adata,
|
|
582
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
583
|
+
reference_col=cfg.reference_column,
|
|
584
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
585
|
+
min_quality=None,
|
|
586
|
+
min_length=None,
|
|
587
|
+
min_mapped_length_to_reference_length_ratio=None,
|
|
588
|
+
sort_by="none",
|
|
589
|
+
max_unknown_fraction=0.5,
|
|
590
|
+
save_path=pp_seq_clustermap_dir,
|
|
591
|
+
show_position_axis=True,
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
pp_dedup_seq_clustermap_dir = (
|
|
595
|
+
preprocess_directory / "deduplicated" / "06_sequence_integer_encoding_clustermaps"
|
|
596
|
+
)
|
|
597
|
+
if pp_dedup_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
598
|
+
logger.debug(
|
|
599
|
+
f"{pp_dedup_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
|
|
600
|
+
)
|
|
601
|
+
else:
|
|
602
|
+
make_dirs([pp_dedup_seq_clustermap_dir])
|
|
603
|
+
plot_sequence_integer_encoding_clustermaps(
|
|
604
|
+
adata_unique,
|
|
605
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
606
|
+
reference_col=cfg.reference_column,
|
|
607
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
608
|
+
min_quality=None,
|
|
609
|
+
min_length=None,
|
|
610
|
+
min_mapped_length_to_reference_length_ratio=None,
|
|
611
|
+
sort_by="none",
|
|
612
|
+
max_unknown_fraction=0.5,
|
|
613
|
+
save_path=pp_dedup_seq_clustermap_dir,
|
|
614
|
+
show_position_axis=True,
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
############################################### Plot read span mask + base quality clustermaps ###############################################
|
|
618
|
+
quality_layer = None
|
|
619
|
+
if "base_quality_scores" in adata.layers:
|
|
620
|
+
quality_layer = "base_quality_scores"
|
|
621
|
+
elif "base_qualities" in adata.layers:
|
|
622
|
+
quality_layer = "base_qualities"
|
|
623
|
+
|
|
624
|
+
if "read_span_mask" not in adata.layers or quality_layer is None:
|
|
625
|
+
logger.debug(
|
|
626
|
+
"read_span_mask and base quality layers not found; skipping read span/base quality clustermaps."
|
|
627
|
+
)
|
|
628
|
+
else:
|
|
629
|
+
pp_span_quality_dir = preprocess_directory / "07_read_span_quality_clustermaps"
|
|
630
|
+
if pp_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
631
|
+
logger.debug(
|
|
632
|
+
f"{pp_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
|
|
633
|
+
)
|
|
634
|
+
else:
|
|
635
|
+
make_dirs([pp_span_quality_dir])
|
|
636
|
+
plot_read_span_quality_clustermaps(
|
|
637
|
+
adata,
|
|
638
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
639
|
+
reference_col=cfg.reference_column,
|
|
640
|
+
quality_layer=quality_layer,
|
|
641
|
+
read_span_layer="read_span_mask",
|
|
642
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
643
|
+
save_path=pp_span_quality_dir,
|
|
644
|
+
show_position_axis=True,
|
|
645
|
+
max_nan_fraction=0.5,
|
|
646
|
+
)
|
|
647
|
+
|
|
648
|
+
pp_dedup_span_quality_dir = (
|
|
649
|
+
preprocess_directory / "deduplicated" / "07_read_span_quality_clustermaps"
|
|
650
|
+
)
|
|
651
|
+
if pp_dedup_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
652
|
+
logger.debug(
|
|
653
|
+
f"{pp_dedup_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
|
|
654
|
+
)
|
|
655
|
+
elif quality_layer in adata_unique.layers and "read_span_mask" in adata_unique.layers:
|
|
656
|
+
make_dirs([pp_dedup_span_quality_dir])
|
|
657
|
+
plot_read_span_quality_clustermaps(
|
|
658
|
+
adata_unique,
|
|
659
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
660
|
+
reference_col=cfg.reference_column,
|
|
661
|
+
quality_layer=quality_layer,
|
|
662
|
+
read_span_layer="read_span_mask",
|
|
663
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
664
|
+
save_path=pp_dedup_span_quality_dir,
|
|
665
|
+
show_position_axis=True,
|
|
666
|
+
max_nan_fraction=0.5,
|
|
667
|
+
)
|
|
668
|
+
|
|
542
669
|
############################################### Save preprocessed adata with duplicate detection ###############################################
|
|
543
670
|
if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
544
671
|
logger.info("Saving preprocessed adata.")
|