smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli/preprocess_adata.py
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
2
4
|
from typing import Optional, Tuple
|
|
3
5
|
|
|
4
6
|
import anndata as ad
|
|
5
7
|
|
|
8
|
+
from smftools.logging_utils import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
6
13
|
def preprocess_adata(
|
|
7
14
|
config_path: str,
|
|
8
15
|
) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
|
|
@@ -29,8 +36,8 @@ def preprocess_adata(
|
|
|
29
36
|
Path to preprocessed, duplicate-removed AnnData.
|
|
30
37
|
"""
|
|
31
38
|
from ..readwrite import safe_read_h5ad
|
|
32
|
-
from .load_adata import load_adata
|
|
33
39
|
from .helpers import get_adata_paths
|
|
40
|
+
from .load_adata import load_adata
|
|
34
41
|
|
|
35
42
|
# 1) Ensure config is loaded and at least *some* AnnData stage exists
|
|
36
43
|
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
@@ -60,20 +67,27 @@ def preprocess_adata(
|
|
|
60
67
|
# Case A: full redo of preprocessing
|
|
61
68
|
# -----------------------------
|
|
62
69
|
if getattr(cfg, "force_redo_preprocessing", False):
|
|
63
|
-
|
|
70
|
+
logger.info(
|
|
71
|
+
"Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
|
|
72
|
+
)
|
|
64
73
|
|
|
65
74
|
if hmm_exists:
|
|
66
75
|
adata = _load(hmm_path)
|
|
76
|
+
source_path = hmm_path
|
|
67
77
|
elif spatial_exists:
|
|
68
78
|
adata = _load(spatial_path)
|
|
79
|
+
source_path = spatial_path
|
|
69
80
|
elif pp_dedup_exists:
|
|
70
81
|
adata = _load(pp_dedup_path)
|
|
82
|
+
source_path = pp_dedup_path
|
|
71
83
|
elif pp_exists:
|
|
72
84
|
adata = _load(pp_path)
|
|
85
|
+
source_path = pp_path
|
|
73
86
|
elif raw_exists:
|
|
74
87
|
adata = _load(raw_path)
|
|
88
|
+
source_path = raw_path
|
|
75
89
|
else:
|
|
76
|
-
|
|
90
|
+
logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
|
|
77
91
|
return (None, None, None, None)
|
|
78
92
|
|
|
79
93
|
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
@@ -81,6 +95,8 @@ def preprocess_adata(
|
|
|
81
95
|
cfg=cfg,
|
|
82
96
|
pp_adata_path=pp_path,
|
|
83
97
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
98
|
+
source_adata_path=source_path,
|
|
99
|
+
config_path=config_path,
|
|
84
100
|
)
|
|
85
101
|
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
86
102
|
|
|
@@ -88,16 +104,18 @@ def preprocess_adata(
|
|
|
88
104
|
# Case B: redo duplicate detection only
|
|
89
105
|
# -----------------------------
|
|
90
106
|
if getattr(cfg, "force_redo_flag_duplicate_reads", False):
|
|
91
|
-
|
|
107
|
+
logger.info(
|
|
92
108
|
"Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
|
|
93
109
|
"if available. Otherwise, will use the raw AnnData."
|
|
94
110
|
)
|
|
95
111
|
if pp_exists:
|
|
96
112
|
adata = _load(pp_path)
|
|
113
|
+
source_path = pp_path
|
|
97
114
|
elif raw_exists:
|
|
98
115
|
adata = _load(raw_path)
|
|
116
|
+
source_path = raw_path
|
|
99
117
|
else:
|
|
100
|
-
|
|
118
|
+
logger.error(
|
|
101
119
|
"Cannot redo duplicate detection: no compatible AnnData available "
|
|
102
120
|
"(need at least raw or preprocessed)."
|
|
103
121
|
)
|
|
@@ -108,6 +126,8 @@ def preprocess_adata(
|
|
|
108
126
|
cfg=cfg,
|
|
109
127
|
pp_adata_path=pp_path,
|
|
110
128
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
129
|
+
source_adata_path=source_path,
|
|
130
|
+
config_path=config_path,
|
|
111
131
|
)
|
|
112
132
|
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
113
133
|
|
|
@@ -117,43 +137,51 @@ def preprocess_adata(
|
|
|
117
137
|
|
|
118
138
|
# If HMM exists, preprocessing is considered “done enough”
|
|
119
139
|
if hmm_exists:
|
|
120
|
-
|
|
140
|
+
logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
|
|
121
141
|
return (None, None, None, None)
|
|
122
142
|
|
|
123
143
|
# If spatial exists, also skip re-preprocessing by default
|
|
124
144
|
if spatial_exists:
|
|
125
|
-
|
|
145
|
+
logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
|
|
126
146
|
return (None, None, None, None)
|
|
127
147
|
|
|
128
148
|
# If pp_dedup exists, just return paths (no recomputation)
|
|
129
149
|
if pp_dedup_exists:
|
|
130
|
-
|
|
150
|
+
logger.debug(
|
|
151
|
+
f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
|
|
152
|
+
)
|
|
131
153
|
return (None, pp_path, None, pp_dedup_path)
|
|
132
154
|
|
|
133
155
|
# If pp exists but pp_dedup does not, load pp and run core
|
|
134
156
|
if pp_exists:
|
|
135
|
-
|
|
157
|
+
logger.debug(f"Preprocessed AnnData found: {pp_path}")
|
|
136
158
|
adata = _load(pp_path)
|
|
159
|
+
source_path = pp_path
|
|
137
160
|
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
138
161
|
adata=adata,
|
|
139
162
|
cfg=cfg,
|
|
140
163
|
pp_adata_path=pp_path,
|
|
141
164
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
165
|
+
source_adata_path=source_path,
|
|
166
|
+
config_path=config_path,
|
|
142
167
|
)
|
|
143
168
|
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
144
169
|
|
|
145
170
|
# Otherwise, fall back to raw (if available)
|
|
146
171
|
if raw_exists:
|
|
147
172
|
adata = _load(raw_path)
|
|
173
|
+
source_path = raw_path
|
|
148
174
|
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
149
175
|
adata=adata,
|
|
150
176
|
cfg=cfg,
|
|
151
177
|
pp_adata_path=pp_path,
|
|
152
178
|
pp_dup_rem_adata_path=pp_dedup_path,
|
|
179
|
+
source_adata_path=source_path,
|
|
180
|
+
config_path=config_path,
|
|
153
181
|
)
|
|
154
182
|
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
155
183
|
|
|
156
|
-
|
|
184
|
+
logger.error("No AnnData available at any stage for preprocessing.")
|
|
157
185
|
return (None, None, None, None)
|
|
158
186
|
|
|
159
187
|
|
|
@@ -162,6 +190,8 @@ def preprocess_adata_core(
|
|
|
162
190
|
cfg,
|
|
163
191
|
pp_adata_path: Path,
|
|
164
192
|
pp_dup_rem_adata_path: Path,
|
|
193
|
+
source_adata_path: Optional[Path] = None,
|
|
194
|
+
config_path: Optional[str] = None,
|
|
165
195
|
) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
|
|
166
196
|
"""
|
|
167
197
|
Core preprocessing pipeline.
|
|
@@ -190,31 +220,32 @@ def preprocess_adata_core(
|
|
|
190
220
|
"""
|
|
191
221
|
from pathlib import Path
|
|
192
222
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
from .helpers import write_gz_h5ad
|
|
196
|
-
from ..readwrite import make_dirs
|
|
223
|
+
from ..metadata import record_smftools_metadata
|
|
224
|
+
from ..plotting import plot_read_qc_histograms
|
|
197
225
|
from ..preprocessing import (
|
|
198
|
-
load_sample_sheet,
|
|
199
|
-
filter_reads_on_length_quality_mapping,
|
|
200
|
-
clean_NaN,
|
|
201
|
-
calculate_coverage,
|
|
202
226
|
append_base_context,
|
|
203
227
|
append_binary_layer_by_base_context,
|
|
228
|
+
binarize_adata,
|
|
229
|
+
binarize_on_Youden,
|
|
230
|
+
calculate_complexity_II,
|
|
231
|
+
calculate_coverage,
|
|
232
|
+
calculate_position_Youden,
|
|
204
233
|
calculate_read_modification_stats,
|
|
234
|
+
clean_NaN,
|
|
235
|
+
filter_reads_on_length_quality_mapping,
|
|
205
236
|
filter_reads_on_modification_thresholds,
|
|
206
237
|
flag_duplicate_reads,
|
|
207
|
-
|
|
208
|
-
calculate_position_Youden,
|
|
209
|
-
binarize_on_Youden,
|
|
210
|
-
binarize_adata,
|
|
238
|
+
load_sample_sheet,
|
|
211
239
|
)
|
|
212
|
-
from ..
|
|
240
|
+
from ..readwrite import make_dirs
|
|
241
|
+
from .helpers import write_gz_h5ad
|
|
213
242
|
|
|
214
243
|
################################### 1) Load existing ###################################
|
|
215
244
|
# General config variable init - Necessary user passed inputs
|
|
216
|
-
smf_modality = cfg.smf_modality
|
|
217
|
-
output_directory = Path(
|
|
245
|
+
smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
|
|
246
|
+
output_directory = Path(
|
|
247
|
+
cfg.output_directory
|
|
248
|
+
) # Path to the output directory to make for the analysis. Necessary.
|
|
218
249
|
make_dirs([output_directory])
|
|
219
250
|
|
|
220
251
|
######### Begin Preprocessing #########
|
|
@@ -222,172 +253,238 @@ def preprocess_adata_core(
|
|
|
222
253
|
|
|
223
254
|
## Load sample sheet metadata based on barcode mapping ##
|
|
224
255
|
if getattr(cfg, "sample_sheet_path", None):
|
|
225
|
-
load_sample_sheet(
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
256
|
+
load_sample_sheet(
|
|
257
|
+
adata,
|
|
258
|
+
cfg.sample_sheet_path,
|
|
259
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
260
|
+
as_category=True,
|
|
261
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
262
|
+
)
|
|
230
263
|
else:
|
|
231
264
|
pass
|
|
232
|
-
|
|
265
|
+
|
|
233
266
|
# Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
|
|
234
267
|
pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
|
|
235
268
|
|
|
236
269
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
237
|
-
|
|
270
|
+
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
238
271
|
else:
|
|
239
272
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
240
|
-
plot_read_qc_histograms(
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
273
|
+
plot_read_qc_histograms(
|
|
274
|
+
adata,
|
|
275
|
+
pp_length_qc_dir,
|
|
276
|
+
cfg.obs_to_plot_pp_qc,
|
|
277
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
278
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
279
|
+
)
|
|
245
280
|
|
|
246
281
|
# Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
|
|
247
282
|
print(adata.shape)
|
|
248
|
-
adata = filter_reads_on_length_quality_mapping(
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
283
|
+
adata = filter_reads_on_length_quality_mapping(
|
|
284
|
+
adata,
|
|
285
|
+
filter_on_coordinates=cfg.read_coord_filter,
|
|
286
|
+
read_length=cfg.read_len_filter_thresholds,
|
|
287
|
+
length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
|
|
288
|
+
read_quality=cfg.read_quality_filter_thresholds,
|
|
289
|
+
mapping_quality=cfg.read_mapping_quality_filter_thresholds,
|
|
290
|
+
bypass=None,
|
|
291
|
+
force_redo=None,
|
|
292
|
+
)
|
|
256
293
|
print(adata.shape)
|
|
257
294
|
|
|
258
295
|
pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
|
|
259
296
|
|
|
260
297
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
261
|
-
|
|
298
|
+
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
262
299
|
else:
|
|
263
300
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
264
|
-
plot_read_qc_histograms(
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
301
|
+
plot_read_qc_histograms(
|
|
302
|
+
adata,
|
|
303
|
+
pp_length_qc_dir,
|
|
304
|
+
cfg.obs_to_plot_pp_qc,
|
|
305
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
306
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
307
|
+
)
|
|
308
|
+
|
|
270
309
|
############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
|
|
271
|
-
if smf_modality ==
|
|
310
|
+
if smf_modality == "direct":
|
|
272
311
|
native = True
|
|
273
312
|
if cfg.fit_position_methylation_thresholds:
|
|
274
313
|
pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
|
|
275
314
|
make_dirs([pp_Youden_dir])
|
|
276
315
|
# Calculate positional methylation thresholds for mod calls
|
|
277
|
-
calculate_position_Youden(
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
316
|
+
calculate_position_Youden(
|
|
317
|
+
adata,
|
|
318
|
+
positive_control_sample=cfg.positive_control_sample_methylation_fitting,
|
|
319
|
+
negative_control_sample=cfg.negative_control_sample_methylation_fitting,
|
|
320
|
+
J_threshold=cfg.fit_j_threshold,
|
|
321
|
+
ref_column=cfg.reference_column,
|
|
322
|
+
sample_column=cfg.sample_column,
|
|
323
|
+
infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
|
|
324
|
+
inference_variable=cfg.inference_variable_sample_methylation_fitting,
|
|
325
|
+
save=True,
|
|
326
|
+
output_directory=pp_Youden_dir,
|
|
327
|
+
)
|
|
288
328
|
# binarize the modcalls based on the determined thresholds
|
|
289
|
-
binarize_on_Youden(
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
329
|
+
binarize_on_Youden(
|
|
330
|
+
adata,
|
|
331
|
+
ref_column=cfg.reference_column,
|
|
332
|
+
output_layer_name=cfg.output_binary_layer_name,
|
|
333
|
+
)
|
|
293
334
|
else:
|
|
294
|
-
binarize_adata(
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
335
|
+
binarize_adata(
|
|
336
|
+
adata,
|
|
337
|
+
source="X",
|
|
338
|
+
target_layer=cfg.output_binary_layer_name,
|
|
339
|
+
threshold=cfg.binarize_on_fixed_methlyation_threshold,
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
clean_NaN(
|
|
343
|
+
adata,
|
|
344
|
+
layer=cfg.output_binary_layer_name,
|
|
345
|
+
bypass=cfg.bypass_clean_nan,
|
|
346
|
+
force_redo=cfg.force_redo_clean_nan,
|
|
347
|
+
)
|
|
304
348
|
else:
|
|
305
349
|
native = False
|
|
306
|
-
clean_NaN(adata,
|
|
307
|
-
|
|
308
|
-
force_redo=cfg.force_redo_clean_nan
|
|
309
|
-
)
|
|
310
|
-
|
|
350
|
+
clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
|
|
351
|
+
|
|
311
352
|
############### Calculate positional coverage by reference set in dataset ###############
|
|
312
|
-
calculate_coverage(
|
|
313
|
-
|
|
314
|
-
|
|
353
|
+
calculate_coverage(
|
|
354
|
+
adata,
|
|
355
|
+
ref_column=cfg.reference_column,
|
|
356
|
+
position_nan_threshold=cfg.position_max_nan_threshold,
|
|
357
|
+
smf_modality=smf_modality,
|
|
358
|
+
target_layer=cfg.output_binary_layer_name,
|
|
359
|
+
)
|
|
315
360
|
|
|
316
361
|
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
|
|
317
362
|
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
318
|
-
append_base_context(
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
force_redo=cfg.force_redo_calculate_read_modification_stats)
|
|
339
|
-
|
|
363
|
+
append_base_context(
|
|
364
|
+
adata,
|
|
365
|
+
ref_column=cfg.reference_column,
|
|
366
|
+
use_consensus=False,
|
|
367
|
+
native=native,
|
|
368
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
369
|
+
bypass=cfg.bypass_append_base_context,
|
|
370
|
+
force_redo=cfg.force_redo_append_base_context,
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
|
|
374
|
+
calculate_read_modification_stats(
|
|
375
|
+
adata,
|
|
376
|
+
cfg.reference_column,
|
|
377
|
+
cfg.sample_column,
|
|
378
|
+
cfg.mod_target_bases,
|
|
379
|
+
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
380
|
+
force_redo=cfg.force_redo_calculate_read_modification_stats,
|
|
381
|
+
)
|
|
382
|
+
|
|
340
383
|
### Make a dir for outputting sample level read modification metrics before filtering ###
|
|
341
384
|
pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
|
|
342
385
|
|
|
343
386
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
344
|
-
|
|
387
|
+
logger.debug(
|
|
388
|
+
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
389
|
+
)
|
|
345
390
|
else:
|
|
346
391
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
347
|
-
obs_to_plot = [
|
|
348
|
-
if any(base in cfg.mod_target_bases for base in [
|
|
349
|
-
obs_to_plot += [
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
392
|
+
obs_to_plot = ["Raw_modification_signal"]
|
|
393
|
+
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
394
|
+
obs_to_plot += [
|
|
395
|
+
"Fraction_GpC_site_modified",
|
|
396
|
+
"Fraction_CpG_site_modified",
|
|
397
|
+
"Fraction_other_C_site_modified",
|
|
398
|
+
"Fraction_C_site_modified",
|
|
399
|
+
]
|
|
400
|
+
if "A" in cfg.mod_target_bases:
|
|
401
|
+
obs_to_plot += ["Fraction_A_site_modified"]
|
|
402
|
+
plot_read_qc_histograms(
|
|
403
|
+
adata,
|
|
404
|
+
pp_meth_qc_dir,
|
|
405
|
+
obs_to_plot,
|
|
406
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
407
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
408
|
+
)
|
|
356
409
|
|
|
357
410
|
##### Optionally filter reads on modification metrics
|
|
358
|
-
adata = filter_reads_on_modification_thresholds(
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
411
|
+
adata = filter_reads_on_modification_thresholds(
|
|
412
|
+
adata,
|
|
413
|
+
smf_modality=smf_modality,
|
|
414
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
415
|
+
gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
|
|
416
|
+
cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
|
|
417
|
+
any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
|
|
418
|
+
a_thresholds=cfg.read_mod_filtering_a_thresholds,
|
|
419
|
+
use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
|
|
420
|
+
min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
421
|
+
bypass=cfg.bypass_filter_reads_on_modification_thresholds,
|
|
422
|
+
force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
|
|
423
|
+
)
|
|
424
|
+
|
|
370
425
|
pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
|
|
371
|
-
|
|
426
|
+
|
|
372
427
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
373
|
-
|
|
428
|
+
logger.debug(
|
|
429
|
+
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
430
|
+
)
|
|
374
431
|
else:
|
|
375
432
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
376
|
-
obs_to_plot = [
|
|
377
|
-
if any(base in cfg.mod_target_bases for base in [
|
|
378
|
-
obs_to_plot += [
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
433
|
+
obs_to_plot = ["Raw_modification_signal"]
|
|
434
|
+
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
435
|
+
obs_to_plot += [
|
|
436
|
+
"Fraction_GpC_site_modified",
|
|
437
|
+
"Fraction_CpG_site_modified",
|
|
438
|
+
"Fraction_other_C_site_modified",
|
|
439
|
+
"Fraction_C_site_modified",
|
|
440
|
+
]
|
|
441
|
+
if "A" in cfg.mod_target_bases:
|
|
442
|
+
obs_to_plot += ["Fraction_A_site_modified"]
|
|
443
|
+
plot_read_qc_histograms(
|
|
444
|
+
adata,
|
|
445
|
+
pp_meth_qc_dir,
|
|
446
|
+
obs_to_plot,
|
|
447
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
448
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
|
|
452
|
+
calculate_coverage(
|
|
453
|
+
adata,
|
|
454
|
+
ref_column=cfg.reference_column,
|
|
455
|
+
position_nan_threshold=cfg.position_max_nan_threshold,
|
|
456
|
+
smf_modality=smf_modality,
|
|
457
|
+
target_layer=cfg.output_binary_layer_name,
|
|
458
|
+
force_redo=True,
|
|
459
|
+
)
|
|
460
|
+
|
|
461
|
+
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
|
|
462
|
+
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
463
|
+
append_base_context(
|
|
464
|
+
adata,
|
|
465
|
+
ref_column=cfg.reference_column,
|
|
466
|
+
use_consensus=False,
|
|
467
|
+
native=native,
|
|
468
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
469
|
+
bypass=cfg.bypass_append_base_context,
|
|
470
|
+
force_redo=True,
|
|
471
|
+
)
|
|
472
|
+
|
|
473
|
+
# Add site type binary modification layers for valid coverage sites
|
|
474
|
+
adata = append_binary_layer_by_base_context(
|
|
475
|
+
adata,
|
|
476
|
+
cfg.reference_column,
|
|
477
|
+
smf_modality,
|
|
478
|
+
bypass=cfg.bypass_append_binary_layer_by_base_context,
|
|
479
|
+
force_redo=cfg.force_redo_append_binary_layer_by_base_context,
|
|
480
|
+
from_valid_sites_only=True,
|
|
481
|
+
)
|
|
385
482
|
|
|
386
483
|
############### Duplicate detection for conversion/deamination SMF ###############
|
|
387
|
-
if smf_modality !=
|
|
484
|
+
if smf_modality != "direct":
|
|
388
485
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
389
486
|
|
|
390
|
-
var_filters_sets =[]
|
|
487
|
+
var_filters_sets = []
|
|
391
488
|
for ref in references:
|
|
392
489
|
for site_type in cfg.duplicate_detection_site_types:
|
|
393
490
|
var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
|
|
@@ -397,27 +494,30 @@ def preprocess_adata_core(
|
|
|
397
494
|
make_dirs([pp_dup_qc_dir])
|
|
398
495
|
|
|
399
496
|
# Flag duplicate reads and plot duplicate detection QC
|
|
400
|
-
adata_unique, adata = flag_duplicate_reads(
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
497
|
+
adata_unique, adata = flag_duplicate_reads(
|
|
498
|
+
adata,
|
|
499
|
+
var_filters_sets,
|
|
500
|
+
distance_threshold=cfg.duplicate_detection_distance_threshold,
|
|
501
|
+
obs_reference_col=cfg.reference_column,
|
|
502
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
503
|
+
output_directory=pp_dup_qc_dir,
|
|
504
|
+
metric_keys=cfg.hamming_vs_metric_keys,
|
|
505
|
+
keep_best_metric=cfg.duplicate_detection_keep_best_metric,
|
|
506
|
+
bypass=cfg.bypass_flag_duplicate_reads,
|
|
507
|
+
force_redo=cfg.force_redo_flag_duplicate_reads,
|
|
508
|
+
window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
509
|
+
min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
|
|
510
|
+
do_pca=cfg.duplicate_detection_do_pca,
|
|
511
|
+
pca_n_components=50,
|
|
512
|
+
pca_center=True,
|
|
513
|
+
do_hierarchical=cfg.duplicate_detection_do_hierarchical,
|
|
514
|
+
hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
|
|
515
|
+
hierarchical_metric="euclidean",
|
|
516
|
+
hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
517
|
+
demux_types=("double", "already"),
|
|
518
|
+
demux_col="demux_type",
|
|
519
|
+
)
|
|
520
|
+
|
|
421
521
|
# Use the flagged duplicate read groups and perform complexity analysis
|
|
422
522
|
complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
|
|
423
523
|
make_dirs([complexity_outs])
|
|
@@ -426,15 +526,15 @@ def preprocess_adata_core(
|
|
|
426
526
|
output_directory=complexity_outs,
|
|
427
527
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
428
528
|
ref_col=cfg.reference_column,
|
|
429
|
-
cluster_col=
|
|
529
|
+
cluster_col="sequence__merged_cluster_id",
|
|
430
530
|
plot=True,
|
|
431
|
-
save_plot=True,
|
|
531
|
+
save_plot=True, # set False to display instead
|
|
432
532
|
n_boot=30,
|
|
433
533
|
n_depths=12,
|
|
434
534
|
random_state=42,
|
|
435
535
|
csv_summary=True,
|
|
436
536
|
bypass=cfg.bypass_complexity_analysis,
|
|
437
|
-
force_redo=cfg.force_redo_complexity_analysis
|
|
537
|
+
force_redo=cfg.force_redo_complexity_analysis,
|
|
438
538
|
)
|
|
439
539
|
|
|
440
540
|
else:
|
|
@@ -443,13 +543,29 @@ def preprocess_adata_core(
|
|
|
443
543
|
|
|
444
544
|
############################################### Save preprocessed adata with duplicate detection ###############################################
|
|
445
545
|
if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
446
|
-
|
|
546
|
+
logger.info("Saving preprocessed adata.")
|
|
547
|
+
record_smftools_metadata(
|
|
548
|
+
adata,
|
|
549
|
+
step_name="preprocess",
|
|
550
|
+
cfg=cfg,
|
|
551
|
+
config_path=config_path,
|
|
552
|
+
input_paths=[source_adata_path] if source_adata_path else None,
|
|
553
|
+
output_path=pp_adata_path,
|
|
554
|
+
)
|
|
447
555
|
write_gz_h5ad(adata, pp_adata_path)
|
|
448
556
|
|
|
449
557
|
if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
450
|
-
|
|
451
|
-
|
|
558
|
+
logger.info("Saving preprocessed adata with duplicates removed.")
|
|
559
|
+
record_smftools_metadata(
|
|
560
|
+
adata_unique,
|
|
561
|
+
step_name="preprocess",
|
|
562
|
+
cfg=cfg,
|
|
563
|
+
config_path=config_path,
|
|
564
|
+
input_paths=[pp_adata_path],
|
|
565
|
+
output_path=pp_dup_rem_adata_path,
|
|
566
|
+
)
|
|
567
|
+
write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
|
|
452
568
|
|
|
453
569
|
########################################################################################################################
|
|
454
570
|
|
|
455
|
-
return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
|
|
571
|
+
return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
|