smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
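
Two new top-level modules anchor this release: `smftools/logging_utils.py` (+51 lines) and `smftools/metadata.py` (+443 lines), which the CLI modules below consume for logging and provenance recording. A minimal sketch of the logging pattern as it appears at the call sites; the body of `get_logger` is an assumption, since only its import and usage are visible in this diff:

# Illustrative sketch only; get_logger's real implementation lives in the new
# smftools/logging_utils.py, which is not shown in this diff.
import logging

def get_logger(name: str) -> logging.Logger:
    # Assumed minimal behavior: return a namespaced stdlib logger.
    return logging.getLogger(name)

logger = get_logger(__name__)
logger.info("Saving preprocessed adata.")  # message taken from the preprocess_adata.py diff below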
smftools/cli/preprocess_adata.py
CHANGED
@@ -3,6 +3,11 @@ from typing import Optional, Tuple
 
 import anndata as ad
 
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
 def preprocess_adata(
     config_path: str,
 ) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
@@ -29,8 +34,8 @@ def preprocess_adata(
         Path to preprocessed, duplicate-removed AnnData.
     """
     from ..readwrite import safe_read_h5ad
-    from .load_adata import load_adata
     from .helpers import get_adata_paths
+    from .load_adata import load_adata
 
     # 1) Ensure config is loaded and at least *some* AnnData stage exists
     loaded_adata, loaded_path, cfg = load_adata(config_path)
@@ -60,20 +65,27 @@ def preprocess_adata(
     # Case A: full redo of preprocessing
     # -----------------------------
     if getattr(cfg, "force_redo_preprocessing", False):
-
+        logger.info(
+            "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
+        )
 
         if hmm_exists:
             adata = _load(hmm_path)
+            source_path = hmm_path
         elif spatial_exists:
             adata = _load(spatial_path)
+            source_path = spatial_path
         elif pp_dedup_exists:
             adata = _load(pp_dedup_path)
+            source_path = pp_dedup_path
         elif pp_exists:
             adata = _load(pp_path)
+            source_path = pp_path
         elif raw_exists:
             adata = _load(raw_path)
+            source_path = raw_path
         else:
-
+            logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
             return (None, None, None, None)
 
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -81,6 +93,8 @@ def preprocess_adata(
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
@@ -88,16 +102,18 @@ def preprocess_adata(
     # Case B: redo duplicate detection only
     # -----------------------------
     if getattr(cfg, "force_redo_flag_duplicate_reads", False):
-
+        logger.info(
             "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
             "if available. Otherwise, will use the raw AnnData."
         )
         if pp_exists:
             adata = _load(pp_path)
+            source_path = pp_path
         elif raw_exists:
             adata = _load(raw_path)
+            source_path = raw_path
         else:
-
+            logger.error(
                 "Cannot redo duplicate detection: no compatible AnnData available "
                 "(need at least raw or preprocessed)."
             )
@@ -108,6 +124,8 @@ def preprocess_adata(
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
@@ -117,43 +135,51 @@ def preprocess_adata(
 
     # If HMM exists, preprocessing is considered “done enough”
    if hmm_exists:
-
+        logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
         return (None, None, None, None)
 
     # If spatial exists, also skip re-preprocessing by default
     if spatial_exists:
-
+        logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
         return (None, None, None, None)
 
     # If pp_dedup exists, just return paths (no recomputation)
     if pp_dedup_exists:
-
+        logger.debug(
+            f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
+        )
         return (None, pp_path, None, pp_dedup_path)
 
     # If pp exists but pp_dedup does not, load pp and run core
     if pp_exists:
-
+        logger.debug(f"Preprocessed AnnData found: {pp_path}")
         adata = _load(pp_path)
+        source_path = pp_path
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
             adata=adata,
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
     # Otherwise, fall back to raw (if available)
     if raw_exists:
         adata = _load(raw_path)
+        source_path = raw_path
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
             adata=adata,
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
-
+    logger.error("No AnnData available at any stage for preprocessing.")
     return (None, None, None, None)
 
 
@@ -162,6 +188,8 @@ def preprocess_adata_core(
     cfg,
     pp_adata_path: Path,
     pp_dup_rem_adata_path: Path,
+    source_adata_path: Optional[Path] = None,
+    config_path: Optional[str] = None,
 ) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
     """
     Core preprocessing pipeline.
@@ -190,31 +218,32 @@ def preprocess_adata_core(
     """
     from pathlib import Path
 
-
-
-    from .helpers import write_gz_h5ad
-    from ..readwrite import make_dirs
+    from ..metadata import record_smftools_metadata
+    from ..plotting import plot_read_qc_histograms
     from ..preprocessing import (
-        load_sample_sheet,
-        filter_reads_on_length_quality_mapping,
-        clean_NaN,
-        calculate_coverage,
         append_base_context,
         append_binary_layer_by_base_context,
+        binarize_adata,
+        binarize_on_Youden,
+        calculate_complexity_II,
+        calculate_coverage,
+        calculate_position_Youden,
         calculate_read_modification_stats,
+        clean_NaN,
+        filter_reads_on_length_quality_mapping,
         filter_reads_on_modification_thresholds,
         flag_duplicate_reads,
-
-        calculate_position_Youden,
-        binarize_on_Youden,
-        binarize_adata,
+        load_sample_sheet,
     )
-    from ..
+    from ..readwrite import make_dirs
+    from .helpers import write_gz_h5ad
 
     ################################### 1) Load existing ###################################
     # General config variable init - Necessary user passed inputs
-    smf_modality = cfg.smf_modality
-    output_directory = Path(
+    smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+    output_directory = Path(
+        cfg.output_directory
+    ) # Path to the output directory to make for the analysis. Necessary.
     make_dirs([output_directory])
 
     ######### Begin Preprocessing #########
@@ -222,172 +251,238 @@ def preprocess_adata_core(
 
     ## Load sample sheet metadata based on barcode mapping ##
     if getattr(cfg, "sample_sheet_path", None):
-        load_sample_sheet(
-
-
-
-
+        load_sample_sheet(
+            adata,
+            cfg.sample_sheet_path,
+            mapping_key_column=cfg.sample_sheet_mapping_column,
+            as_category=True,
+            force_reload=cfg.force_reload_sample_sheet,
+        )
     else:
         pass
-
+
     # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
     pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
 
     if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-
+        logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
     else:
         make_dirs([pp_dir, pp_length_qc_dir])
-        plot_read_qc_histograms(
-
-
-
-
+        plot_read_qc_histograms(
+            adata,
+            pp_length_qc_dir,
+            cfg.obs_to_plot_pp_qc,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
 
     # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
     print(adata.shape)
-    adata = filter_reads_on_length_quality_mapping(
-
-
-
-
-
-
-
+    adata = filter_reads_on_length_quality_mapping(
+        adata,
+        filter_on_coordinates=cfg.read_coord_filter,
+        read_length=cfg.read_len_filter_thresholds,
+        length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
+        read_quality=cfg.read_quality_filter_thresholds,
+        mapping_quality=cfg.read_mapping_quality_filter_thresholds,
+        bypass=None,
+        force_redo=None,
+    )
     print(adata.shape)
 
     pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
 
     if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-
+        logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
     else:
         make_dirs([pp_dir, pp_length_qc_dir])
-        plot_read_qc_histograms(
-
-
-
-
-
+        plot_read_qc_histograms(
+            adata,
+            pp_length_qc_dir,
+            cfg.obs_to_plot_pp_qc,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
+
     ############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
-    if smf_modality ==
+    if smf_modality == "direct":
         native = True
         if cfg.fit_position_methylation_thresholds:
             pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
             make_dirs([pp_Youden_dir])
             # Calculate positional methylation thresholds for mod calls
-            calculate_position_Youden(
-
-
-
-
-
-
-
-
-
-
+            calculate_position_Youden(
+                adata,
+                positive_control_sample=cfg.positive_control_sample_methylation_fitting,
+                negative_control_sample=cfg.negative_control_sample_methylation_fitting,
+                J_threshold=cfg.fit_j_threshold,
+                ref_column=cfg.reference_column,
+                sample_column=cfg.sample_column,
+                infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
+                inference_variable=cfg.inference_variable_sample_methylation_fitting,
+                save=True,
+                output_directory=pp_Youden_dir,
+            )
             # binarize the modcalls based on the determined thresholds
-            binarize_on_Youden(
-
-
-
+            binarize_on_Youden(
+                adata,
+                ref_column=cfg.reference_column,
+                output_layer_name=cfg.output_binary_layer_name,
+            )
         else:
-            binarize_adata(
-
-
-
-
-
-
-
-
-
+            binarize_adata(
+                adata,
+                source="X",
+                target_layer=cfg.output_binary_layer_name,
+                threshold=cfg.binarize_on_fixed_methlyation_threshold,
+            )
+
+        clean_NaN(
+            adata,
+            layer=cfg.output_binary_layer_name,
+            bypass=cfg.bypass_clean_nan,
+            force_redo=cfg.force_redo_clean_nan,
+        )
     else:
         native = False
-        clean_NaN(adata,
-
-            force_redo=cfg.force_redo_clean_nan
-        )
-
+        clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
+
     ############### Calculate positional coverage by reference set in dataset ###############
-    calculate_coverage(
-
-
+    calculate_coverage(
+        adata,
+        ref_column=cfg.reference_column,
+        position_nan_threshold=cfg.position_max_nan_threshold,
+        smf_modality=smf_modality,
+        target_layer=cfg.output_binary_layer_name,
+    )
 
     ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
     # Additionally, store base_context level binary modification arrays in adata.obsm
-    append_base_context(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        force_redo=cfg.force_redo_calculate_read_modification_stats)
-
+    append_base_context(
+        adata,
+        ref_column=cfg.reference_column,
+        use_consensus=False,
+        native=native,
+        mod_target_bases=cfg.mod_target_bases,
+        bypass=cfg.bypass_append_base_context,
+        force_redo=cfg.force_redo_append_base_context,
+    )
+
+    ############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
+    calculate_read_modification_stats(
+        adata,
+        cfg.reference_column,
+        cfg.sample_column,
+        cfg.mod_target_bases,
+        bypass=cfg.bypass_calculate_read_modification_stats,
+        force_redo=cfg.force_redo_calculate_read_modification_stats,
+    )
+
     ### Make a dir for outputting sample level read modification metrics before filtering ###
     pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
 
     if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-
+        logger.debug(
+            f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
+        )
     else:
         make_dirs([pp_dir, pp_meth_qc_dir])
-        obs_to_plot = [
-        if any(base in cfg.mod_target_bases for base in [
-            obs_to_plot += [
-
-
-
-
-
-
+        obs_to_plot = ["Raw_modification_signal"]
+        if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
+            obs_to_plot += [
+                "Fraction_GpC_site_modified",
+                "Fraction_CpG_site_modified",
+                "Fraction_other_C_site_modified",
+                "Fraction_C_site_modified",
+            ]
+        if "A" in cfg.mod_target_bases:
+            obs_to_plot += ["Fraction_A_site_modified"]
+        plot_read_qc_histograms(
+            adata,
+            pp_meth_qc_dir,
+            obs_to_plot,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
 
     ##### Optionally filter reads on modification metrics
-    adata = filter_reads_on_modification_thresholds(
-
-
-
-
-
-
-
-
-
-
-
+    adata = filter_reads_on_modification_thresholds(
+        adata,
+        smf_modality=smf_modality,
+        mod_target_bases=cfg.mod_target_bases,
+        gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
+        cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
+        any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
+        a_thresholds=cfg.read_mod_filtering_a_thresholds,
+        use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
+        min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
+        bypass=cfg.bypass_filter_reads_on_modification_thresholds,
+        force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
+    )
+
     pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
-
+
     if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-
+        logger.debug(
+            f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
+        )
     else:
         make_dirs([pp_dir, pp_meth_qc_dir])
-        obs_to_plot = [
-        if any(base in cfg.mod_target_bases for base in [
-            obs_to_plot += [
-
-
-
-
-
-
+        obs_to_plot = ["Raw_modification_signal"]
+        if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
+            obs_to_plot += [
+                "Fraction_GpC_site_modified",
+                "Fraction_CpG_site_modified",
+                "Fraction_other_C_site_modified",
+                "Fraction_C_site_modified",
+            ]
+        if "A" in cfg.mod_target_bases:
+            obs_to_plot += ["Fraction_A_site_modified"]
+        plot_read_qc_histograms(
+            adata,
+            pp_meth_qc_dir,
+            obs_to_plot,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
+
+    ############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
+    calculate_coverage(
+        adata,
+        ref_column=cfg.reference_column,
+        position_nan_threshold=cfg.position_max_nan_threshold,
+        smf_modality=smf_modality,
+        target_layer=cfg.output_binary_layer_name,
+        force_redo=True,
+    )
+
+    ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
+    # Additionally, store base_context level binary modification arrays in adata.obsm
+    append_base_context(
+        adata,
+        ref_column=cfg.reference_column,
+        use_consensus=False,
+        native=native,
+        mod_target_bases=cfg.mod_target_bases,
+        bypass=cfg.bypass_append_base_context,
+        force_redo=True,
+    )
+
+    # Add site type binary modification layers for valid coverage sites
+    adata = append_binary_layer_by_base_context(
+        adata,
+        cfg.reference_column,
+        smf_modality,
+        bypass=cfg.bypass_append_binary_layer_by_base_context,
+        force_redo=cfg.force_redo_append_binary_layer_by_base_context,
+        from_valid_sites_only=True,
+    )
 
     ############### Duplicate detection for conversion/deamination SMF ###############
-    if smf_modality !=
+    if smf_modality != "direct":
         references = adata.obs[cfg.reference_column].cat.categories
 
-        var_filters_sets =[]
+        var_filters_sets = []
         for ref in references:
             for site_type in cfg.duplicate_detection_site_types:
                 var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
@@ -397,27 +492,30 @@ def preprocess_adata_core(
         make_dirs([pp_dup_qc_dir])
 
         # Flag duplicate reads and plot duplicate detection QC
-        adata_unique, adata = flag_duplicate_reads(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        adata_unique, adata = flag_duplicate_reads(
+            adata,
+            var_filters_sets,
+            distance_threshold=cfg.duplicate_detection_distance_threshold,
+            obs_reference_col=cfg.reference_column,
+            sample_col=cfg.sample_name_col_for_plotting,
+            output_directory=pp_dup_qc_dir,
+            metric_keys=cfg.hamming_vs_metric_keys,
+            keep_best_metric=cfg.duplicate_detection_keep_best_metric,
+            bypass=cfg.bypass_flag_duplicate_reads,
+            force_redo=cfg.force_redo_flag_duplicate_reads,
+            window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
+            min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
+            do_pca=cfg.duplicate_detection_do_pca,
+            pca_n_components=50,
+            pca_center=True,
+            do_hierarchical=cfg.duplicate_detection_do_hierarchical,
+            hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
+            hierarchical_metric="euclidean",
+            hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
+            demux_types=("double", "already"),
+            demux_col="demux_type",
+        )
+
         # Use the flagged duplicate read groups and perform complexity analysis
         complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
         make_dirs([complexity_outs])
@@ -426,15 +524,15 @@ def preprocess_adata_core(
             output_directory=complexity_outs,
             sample_col=cfg.sample_name_col_for_plotting,
             ref_col=cfg.reference_column,
-            cluster_col=
+            cluster_col="sequence__merged_cluster_id",
             plot=True,
-            save_plot=True,
+            save_plot=True, # set False to display instead
             n_boot=30,
             n_depths=12,
            random_state=42,
             csv_summary=True,
             bypass=cfg.bypass_complexity_analysis,
-            force_redo=cfg.force_redo_complexity_analysis
+            force_redo=cfg.force_redo_complexity_analysis,
         )
 
     else:
@@ -443,13 +541,29 @@ def preprocess_adata_core(
 
     ############################################### Save preprocessed adata with duplicate detection ###############################################
     if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
-
+        logger.info("Saving preprocessed adata.")
+        record_smftools_metadata(
+            adata,
+            step_name="preprocess",
+            cfg=cfg,
+            config_path=config_path,
+            input_paths=[source_adata_path] if source_adata_path else None,
+            output_path=pp_adata_path,
+        )
         write_gz_h5ad(adata, pp_adata_path)
 
     if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
-
-
+        logger.info("Saving preprocessed adata with duplicates removed.")
+        record_smftools_metadata(
+            adata_unique,
+            step_name="preprocess",
+            cfg=cfg,
+            config_path=config_path,
+            input_paths=[pp_adata_path],
+            output_path=pp_dup_rem_adata_path,
+        )
+        write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
 
     ########################################################################################################################
 
-    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
+    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
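
The reworked entry point keeps its single `config_path` argument and its four-tuple return, so existing callers are unaffected. A usage sketch under that assumption (the YAML filename is illustrative):

# Illustrative usage of the updated stage entry point; the config filename is hypothetical.
from smftools.cli.preprocess_adata import preprocess_adata

pp_adata, pp_path, pp_dedup_adata, pp_dedup_path = preprocess_adata("experiment_config.yaml")
if pp_adata is None and pp_path is None:
    # Per the diff, an all-None tuple means no AnnData stage was available,
    # or a later stage (spatial/HMM AnnData) already exists and preprocessing was skipped.
    print("Preprocessing skipped or no AnnData available; see the log output.")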