smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/cli/preprocess_adata.py
CHANGED
|
@@ -1,109 +1,227 @@
|
|
|
1
|
-
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import anndata as ad
|
|
5
|
+
|
|
6
|
+
def preprocess_adata(
|
|
7
|
+
config_path: str,
|
|
8
|
+
) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
|
|
2
9
|
"""
|
|
3
|
-
|
|
4
|
-
Command line accesses this through smftools preprocess <config_path>
|
|
10
|
+
CLI-facing wrapper for preprocessing.
|
|
5
11
|
|
|
6
|
-
|
|
7
|
-
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
12
|
+
Called by: `smftools preprocess <config_path>`
|
|
8
13
|
|
|
9
|
-
|
|
10
|
-
|
|
14
|
+
- Ensure a raw AnnData exists (or some later-stage AnnData) via `load_adata`.
|
|
15
|
+
- Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
|
|
16
|
+
- Respect cfg flags (force_redo_preprocessing, force_redo_flag_duplicate_reads).
|
|
17
|
+
- Decide what starting AnnData to load (or whether to early-return).
|
|
18
|
+
- Call `preprocess_adata_core(...)` when appropriate.
|
|
19
|
+
|
|
20
|
+
Returns
|
|
21
|
+
-------
|
|
22
|
+
pp_adata : AnnData | None
|
|
23
|
+
Preprocessed AnnData (may be None if we skipped work).
|
|
24
|
+
pp_adata_path : Path | None
|
|
25
|
+
Path to preprocessed AnnData.
|
|
26
|
+
pp_dedup_adata : AnnData | None
|
|
27
|
+
Preprocessed, duplicate-removed AnnData.
|
|
28
|
+
pp_dedup_adata_path : Path | None
|
|
29
|
+
Path to preprocessed, duplicate-removed AnnData.
|
|
11
30
|
"""
|
|
12
|
-
from ..readwrite import safe_read_h5ad
|
|
31
|
+
from ..readwrite import safe_read_h5ad
|
|
13
32
|
from .load_adata import load_adata
|
|
33
|
+
from .helpers import get_adata_paths
|
|
14
34
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
35
|
+
# 1) Ensure config is loaded and at least *some* AnnData stage exists
|
|
36
|
+
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
37
|
+
|
|
38
|
+
# 2) Compute canonical paths
|
|
39
|
+
paths = get_adata_paths(cfg)
|
|
40
|
+
raw_path = paths.raw
|
|
41
|
+
pp_path = paths.pp
|
|
42
|
+
pp_dedup_path = paths.pp_dedup
|
|
43
|
+
spatial_path = paths.spatial
|
|
44
|
+
hmm_path = paths.hmm
|
|
45
|
+
|
|
46
|
+
raw_exists = raw_path.exists()
|
|
47
|
+
pp_exists = pp_path.exists()
|
|
48
|
+
pp_dedup_exists = pp_dedup_path.exists()
|
|
49
|
+
spatial_exists = spatial_path.exists()
|
|
50
|
+
hmm_exists = hmm_path.exists()
|
|
51
|
+
|
|
52
|
+
# Helper: reuse loaded_adata if it matches the path we want, else read from disk
|
|
53
|
+
def _load(path: Path):
|
|
54
|
+
if loaded_adata is not None and loaded_path == path:
|
|
55
|
+
return loaded_adata
|
|
56
|
+
adata, _ = safe_read_h5ad(path)
|
|
57
|
+
return adata
|
|
58
|
+
|
|
59
|
+
# -----------------------------
|
|
60
|
+
# Case A: full redo of preprocessing
|
|
61
|
+
# -----------------------------
|
|
62
|
+
if getattr(cfg, "force_redo_preprocessing", False):
|
|
63
|
+
print("Forcing full redo of preprocessing workflow, starting from latest stage AnnData available.")
|
|
64
|
+
|
|
65
|
+
if hmm_exists:
|
|
66
|
+
adata = _load(hmm_path)
|
|
67
|
+
elif spatial_exists:
|
|
68
|
+
adata = _load(spatial_path)
|
|
69
|
+
elif pp_dedup_exists:
|
|
70
|
+
adata = _load(pp_dedup_path)
|
|
71
|
+
elif pp_exists:
|
|
72
|
+
adata = _load(pp_path)
|
|
73
|
+
elif raw_exists:
|
|
74
|
+
adata = _load(raw_path)
|
|
75
|
+
else:
|
|
76
|
+
print("Cannot redo preprocessing: no AnnData available at any stage.")
|
|
77
|
+
return (None, None, None, None)
|
|
78
|
+
|
|
79
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
80
|
+
adata=adata,
|
|
81
|
+
cfg=cfg,
|
|
82
|
+
pp_adata_path=pp_path,
|
|
83
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
84
|
+
)
|
|
85
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
86
|
+
|
|
87
|
+
# -----------------------------
|
|
88
|
+
# Case B: redo duplicate detection only
|
|
89
|
+
# -----------------------------
|
|
90
|
+
if getattr(cfg, "force_redo_flag_duplicate_reads", False):
|
|
91
|
+
print(
|
|
92
|
+
"Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
|
|
93
|
+
"if available. Otherwise, will use the raw AnnData."
|
|
94
|
+
)
|
|
95
|
+
if pp_exists:
|
|
96
|
+
adata = _load(pp_path)
|
|
97
|
+
elif raw_exists:
|
|
98
|
+
adata = _load(raw_path)
|
|
99
|
+
else:
|
|
100
|
+
print(
|
|
101
|
+
"Cannot redo duplicate detection: no compatible AnnData available "
|
|
102
|
+
"(need at least raw or preprocessed)."
|
|
103
|
+
)
|
|
104
|
+
return (None, None, None, None)
|
|
105
|
+
|
|
106
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
107
|
+
adata=adata,
|
|
108
|
+
cfg=cfg,
|
|
109
|
+
pp_adata_path=pp_path,
|
|
110
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
111
|
+
)
|
|
112
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
113
|
+
|
|
114
|
+
# -----------------------------
|
|
115
|
+
# Case C: normal behavior (no explicit redo flags)
|
|
116
|
+
# -----------------------------
|
|
117
|
+
|
|
118
|
+
# If HMM exists, preprocessing is considered “done enough”
|
|
119
|
+
if hmm_exists:
|
|
120
|
+
print(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
|
|
121
|
+
return (None, None, None, None)
|
|
122
|
+
|
|
123
|
+
# If spatial exists, also skip re-preprocessing by default
|
|
124
|
+
if spatial_exists:
|
|
125
|
+
print(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
|
|
126
|
+
return (None, None, None, None)
|
|
127
|
+
|
|
128
|
+
# If pp_dedup exists, just return paths (no recomputation)
|
|
129
|
+
if pp_dedup_exists:
|
|
130
|
+
print(f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}")
|
|
131
|
+
return (None, pp_path, None, pp_dedup_path)
|
|
132
|
+
|
|
133
|
+
# If pp exists but pp_dedup does not, load pp and run core
|
|
134
|
+
if pp_exists:
|
|
135
|
+
print(f"Preprocessed AnnData found: {pp_path}")
|
|
136
|
+
adata = _load(pp_path)
|
|
137
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
138
|
+
adata=adata,
|
|
139
|
+
cfg=cfg,
|
|
140
|
+
pp_adata_path=pp_path,
|
|
141
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
142
|
+
)
|
|
143
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
144
|
+
|
|
145
|
+
# Otherwise, fall back to raw (if available)
|
|
146
|
+
if raw_exists:
|
|
147
|
+
adata = _load(raw_path)
|
|
148
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
149
|
+
adata=adata,
|
|
150
|
+
cfg=cfg,
|
|
151
|
+
pp_adata_path=pp_path,
|
|
152
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
153
|
+
)
|
|
154
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
155
|
+
|
|
156
|
+
print("No AnnData available at any stage for preprocessing.")
|
|
157
|
+
return (None, None, None, None)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def preprocess_adata_core(
|
|
161
|
+
adata: ad.AnnData,
|
|
162
|
+
cfg,
|
|
163
|
+
pp_adata_path: Path,
|
|
164
|
+
pp_dup_rem_adata_path: Path,
|
|
165
|
+
) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
|
|
166
|
+
"""
|
|
167
|
+
Core preprocessing pipeline.
|
|
168
|
+
|
|
169
|
+
Assumes:
|
|
170
|
+
- `adata` is an AnnData object at some stage (raw/pp/etc.) to start preprocessing from.
|
|
171
|
+
- `cfg` is the ExperimentConfig containing all thresholds & options.
|
|
172
|
+
- `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for
|
|
173
|
+
preprocessed and preprocessed+deduplicated AnnData.
|
|
19
174
|
|
|
20
|
-
|
|
21
|
-
from
|
|
175
|
+
Does NOT:
|
|
176
|
+
- Decide which stage to load from (that's the wrapper's job).
|
|
177
|
+
- Decide whether to skip entirely; it always runs its steps, but individual
|
|
178
|
+
sub-steps may skip based on `cfg.bypass_*` or directory existence.
|
|
179
|
+
|
|
180
|
+
Returns
|
|
181
|
+
-------
|
|
182
|
+
pp_adata : AnnData
|
|
183
|
+
Preprocessed AnnData (with QC filters, binarization, etc.).
|
|
184
|
+
pp_adata_path : Path
|
|
185
|
+
Path where pp_adata was written.
|
|
186
|
+
pp_dedup_adata : AnnData
|
|
187
|
+
Preprocessed AnnData with duplicate reads removed (for non-direct SMF).
|
|
188
|
+
pp_dup_rem_adata_path : Path
|
|
189
|
+
Path where pp_dedup_adata was written.
|
|
190
|
+
"""
|
|
22
191
|
from pathlib import Path
|
|
23
192
|
|
|
24
|
-
|
|
25
|
-
date_str = datetime.today().strftime("%y%m%d")
|
|
193
|
+
import numpy as np
|
|
26
194
|
|
|
27
|
-
|
|
28
|
-
|
|
195
|
+
from .helpers import write_gz_h5ad
|
|
196
|
+
from ..readwrite import make_dirs
|
|
197
|
+
from ..preprocessing import (
|
|
198
|
+
load_sample_sheet,
|
|
199
|
+
filter_reads_on_length_quality_mapping,
|
|
200
|
+
clean_NaN,
|
|
201
|
+
calculate_coverage,
|
|
202
|
+
append_base_context,
|
|
203
|
+
append_binary_layer_by_base_context,
|
|
204
|
+
calculate_read_modification_stats,
|
|
205
|
+
filter_reads_on_modification_thresholds,
|
|
206
|
+
flag_duplicate_reads,
|
|
207
|
+
calculate_complexity_II,
|
|
208
|
+
calculate_position_Youden,
|
|
209
|
+
binarize_on_Youden,
|
|
210
|
+
binarize_adata,
|
|
211
|
+
)
|
|
212
|
+
from ..plotting import plot_read_qc_histograms
|
|
29
213
|
|
|
214
|
+
################################### 1) Load existing ###################################
|
|
30
215
|
# General config variable init - Necessary user passed inputs
|
|
31
216
|
smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
|
|
32
217
|
output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
|
|
33
|
-
|
|
34
|
-
# Make initial output directory
|
|
35
218
|
make_dirs([output_directory])
|
|
36
219
|
|
|
37
|
-
input_manager_df = pd.read_csv(cfg.summary_file)
|
|
38
|
-
initial_adata_path = Path(input_manager_df['load_adata'][0])
|
|
39
|
-
pp_adata_path = Path(input_manager_df['pp_adata'][0])
|
|
40
|
-
pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
|
|
41
|
-
spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
|
|
42
|
-
hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
|
|
43
|
-
|
|
44
|
-
if adata:
|
|
45
|
-
# This happens on first run of the load pipeline
|
|
46
|
-
pass
|
|
47
|
-
else:
|
|
48
|
-
# If an anndata is saved, check which stages of the anndata are available
|
|
49
|
-
initial_version_available = initial_adata_path.exists()
|
|
50
|
-
preprocessed_version_available = pp_adata_path.exists()
|
|
51
|
-
preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
|
|
52
|
-
spatial_adata_exists = spatial_adata_path.exists()
|
|
53
|
-
hmm_adata_exists = hmm_adata_path.exists()
|
|
54
|
-
|
|
55
|
-
if cfg.force_redo_preprocessing:
|
|
56
|
-
print(f"Forcing full redo of preprocessing workflow, starting from earliest stage adata available.")
|
|
57
|
-
if initial_version_available:
|
|
58
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
59
|
-
elif preprocessed_version_available:
|
|
60
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
61
|
-
elif preprocessed_dup_removed_version_available:
|
|
62
|
-
adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
|
|
63
|
-
else:
|
|
64
|
-
print(f"Can not redo preprocessing when there is no adata available.")
|
|
65
|
-
return
|
|
66
|
-
elif cfg.force_redo_flag_duplicate_reads:
|
|
67
|
-
print(f"Forcing redo of duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
|
|
68
|
-
if preprocessed_version_available:
|
|
69
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
70
|
-
elif initial_version_available:
|
|
71
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
72
|
-
else:
|
|
73
|
-
print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
|
|
74
|
-
return
|
|
75
|
-
elif cfg.force_redo_basic_analyses:
|
|
76
|
-
print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
|
|
77
|
-
if preprocessed_version_available:
|
|
78
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
79
|
-
elif initial_version_available:
|
|
80
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
81
|
-
else:
|
|
82
|
-
print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
|
|
83
|
-
elif hmm_adata_exists:
|
|
84
|
-
print(f"HMM anndata found: {hmm_adata_path}")
|
|
85
|
-
return (None, None, None, None)
|
|
86
|
-
elif spatial_adata_exists:
|
|
87
|
-
print(f"Spatial anndata found: {spatial_adata_exists}")
|
|
88
|
-
return (None, None, None, None)
|
|
89
|
-
elif preprocessed_dup_removed_version_available:
|
|
90
|
-
print(f"Preprocessed deduplicated anndata found: {pp_dup_rem_adata_path}")
|
|
91
|
-
return (None, pp_adata_path, None, pp_dup_rem_adata_path)
|
|
92
|
-
elif preprocessed_version_available:
|
|
93
|
-
print(f"Preprocessed anndata found: {pp_adata_path}")
|
|
94
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
95
|
-
elif initial_version_available:
|
|
96
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
97
|
-
else:
|
|
98
|
-
print(f"No adata available.")
|
|
99
|
-
return
|
|
100
|
-
|
|
101
220
|
######### Begin Preprocessing #########
|
|
102
221
|
pp_dir = output_directory / "preprocessed"
|
|
103
222
|
|
|
104
223
|
## Load sample sheet metadata based on barcode mapping ##
|
|
105
|
-
if cfg
|
|
106
|
-
from ..preprocessing import load_sample_sheet
|
|
224
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
107
225
|
load_sample_sheet(adata,
|
|
108
226
|
cfg.sample_sheet_path,
|
|
109
227
|
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
@@ -118,17 +236,14 @@ def preprocess_adata(config_path):
|
|
|
118
236
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
119
237
|
print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
|
|
120
238
|
else:
|
|
121
|
-
from ..plotting import plot_read_qc_histograms
|
|
122
239
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
123
|
-
obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
|
|
124
240
|
plot_read_qc_histograms(adata,
|
|
125
241
|
pp_length_qc_dir,
|
|
126
|
-
|
|
242
|
+
cfg.obs_to_plot_pp_qc,
|
|
127
243
|
sample_key=cfg.sample_name_col_for_plotting,
|
|
128
244
|
rows_per_fig=cfg.rows_per_qc_histogram_grid)
|
|
129
245
|
|
|
130
246
|
# Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
|
|
131
|
-
from ..preprocessing import filter_reads_on_length_quality_mapping
|
|
132
247
|
print(adata.shape)
|
|
133
248
|
adata = filter_reads_on_length_quality_mapping(adata,
|
|
134
249
|
filter_on_coordinates=cfg.read_coord_filter,
|
|
@@ -145,19 +260,15 @@ def preprocess_adata(config_path):
|
|
|
145
260
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
146
261
|
print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
|
|
147
262
|
else:
|
|
148
|
-
from ..plotting import plot_read_qc_histograms
|
|
149
263
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
150
|
-
obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
|
|
151
264
|
plot_read_qc_histograms(adata,
|
|
152
265
|
pp_length_qc_dir,
|
|
153
|
-
|
|
266
|
+
cfg.obs_to_plot_pp_qc,
|
|
154
267
|
sample_key=cfg.sample_name_col_for_plotting,
|
|
155
268
|
rows_per_fig=cfg.rows_per_qc_histogram_grid)
|
|
156
269
|
|
|
157
270
|
############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
|
|
158
|
-
from ..preprocessing import clean_NaN
|
|
159
271
|
if smf_modality == 'direct':
|
|
160
|
-
from ..preprocessing import calculate_position_Youden, binarize_on_Youden, binarize_adata
|
|
161
272
|
native = True
|
|
162
273
|
if cfg.fit_position_methylation_thresholds:
|
|
163
274
|
pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
|
|
@@ -167,7 +278,8 @@ def preprocess_adata(config_path):
|
|
|
167
278
|
positive_control_sample=cfg.positive_control_sample_methylation_fitting,
|
|
168
279
|
negative_control_sample=cfg.negative_control_sample_methylation_fitting,
|
|
169
280
|
J_threshold=cfg.fit_j_threshold,
|
|
170
|
-
|
|
281
|
+
ref_column=cfg.reference_column,
|
|
282
|
+
sample_column=cfg.sample_column,
|
|
171
283
|
infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
|
|
172
284
|
inference_variable=cfg.inference_variable_sample_methylation_fitting,
|
|
173
285
|
save=True,
|
|
@@ -175,7 +287,7 @@ def preprocess_adata(config_path):
|
|
|
175
287
|
)
|
|
176
288
|
# binarize the modcalls based on the determined thresholds
|
|
177
289
|
binarize_on_Youden(adata,
|
|
178
|
-
|
|
290
|
+
ref_column=cfg.reference_column,
|
|
179
291
|
output_layer_name=cfg.output_binary_layer_name
|
|
180
292
|
)
|
|
181
293
|
else:
|
|
@@ -195,12 +307,16 @@ def preprocess_adata(config_path):
|
|
|
195
307
|
bypass=cfg.bypass_clean_nan,
|
|
196
308
|
force_redo=cfg.force_redo_clean_nan
|
|
197
309
|
)
|
|
310
|
+
|
|
311
|
+
############### Calculate positional coverage by reference set in dataset ###############
|
|
312
|
+
calculate_coverage(adata,
|
|
313
|
+
ref_column=cfg.reference_column,
|
|
314
|
+
position_nan_threshold=cfg.position_max_nan_threshold)
|
|
198
315
|
|
|
199
316
|
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
|
|
200
|
-
from ..preprocessing import append_base_context, append_binary_layer_by_base_context
|
|
201
317
|
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
202
318
|
append_base_context(adata,
|
|
203
|
-
|
|
319
|
+
ref_column=cfg.reference_column,
|
|
204
320
|
use_consensus=False,
|
|
205
321
|
native=native,
|
|
206
322
|
mod_target_bases=cfg.mod_target_bases,
|
|
@@ -212,20 +328,14 @@ def preprocess_adata(config_path):
|
|
|
212
328
|
smf_modality,
|
|
213
329
|
bypass=cfg.bypass_append_binary_layer_by_base_context,
|
|
214
330
|
force_redo=cfg.force_redo_append_binary_layer_by_base_context)
|
|
215
|
-
|
|
216
|
-
############### Optional inversion of the adata along positions axis ###################
|
|
217
|
-
if cfg.invert_adata:
|
|
218
|
-
from ..preprocessing import invert_adata
|
|
219
|
-
adata = invert_adata(adata)
|
|
220
331
|
|
|
221
332
|
############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
|
|
222
|
-
from ..preprocessing import calculate_read_modification_stats
|
|
223
333
|
calculate_read_modification_stats(adata,
|
|
224
334
|
cfg.reference_column,
|
|
225
335
|
cfg.sample_column,
|
|
226
336
|
cfg.mod_target_bases,
|
|
227
337
|
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
228
|
-
force_redo=cfg.force_redo_calculate_read_modification_stats)
|
|
338
|
+
force_redo=cfg.force_redo_calculate_read_modification_stats)
|
|
229
339
|
|
|
230
340
|
### Make a dir for outputting sample level read modification metrics before filtering ###
|
|
231
341
|
pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
|
|
@@ -233,11 +343,10 @@ def preprocess_adata(config_path):
|
|
|
233
343
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
234
344
|
print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
|
|
235
345
|
else:
|
|
236
|
-
from ..plotting import plot_read_qc_histograms
|
|
237
346
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
238
347
|
obs_to_plot = ['Raw_modification_signal']
|
|
239
348
|
if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
240
|
-
obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', '
|
|
349
|
+
obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
|
|
241
350
|
if 'A' in cfg.mod_target_bases:
|
|
242
351
|
obs_to_plot += ['Fraction_A_site_modified']
|
|
243
352
|
plot_read_qc_histograms(adata,
|
|
@@ -246,13 +355,12 @@ def preprocess_adata(config_path):
|
|
|
246
355
|
rows_per_fig=cfg.rows_per_qc_histogram_grid)
|
|
247
356
|
|
|
248
357
|
##### Optionally filter reads on modification metrics
|
|
249
|
-
from ..preprocessing import filter_reads_on_modification_thresholds
|
|
250
358
|
adata = filter_reads_on_modification_thresholds(adata,
|
|
251
359
|
smf_modality=smf_modality,
|
|
252
360
|
mod_target_bases=cfg.mod_target_bases,
|
|
253
361
|
gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
|
|
254
362
|
cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
|
|
255
|
-
any_c_thresholds=cfg.
|
|
363
|
+
any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
|
|
256
364
|
a_thresholds=cfg.read_mod_filtering_a_thresholds,
|
|
257
365
|
use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
|
|
258
366
|
min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
@@ -264,27 +372,19 @@ def preprocess_adata(config_path):
|
|
|
264
372
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
265
373
|
print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
|
|
266
374
|
else:
|
|
267
|
-
from ..plotting import plot_read_qc_histograms
|
|
268
375
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
269
376
|
obs_to_plot = ['Raw_modification_signal']
|
|
270
377
|
if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
271
|
-
obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', '
|
|
378
|
+
obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
|
|
272
379
|
if 'A' in cfg.mod_target_bases:
|
|
273
380
|
obs_to_plot += ['Fraction_A_site_modified']
|
|
274
381
|
plot_read_qc_histograms(adata,
|
|
275
382
|
pp_meth_qc_dir, obs_to_plot,
|
|
276
383
|
sample_key=cfg.sample_name_col_for_plotting,
|
|
277
384
|
rows_per_fig=cfg.rows_per_qc_histogram_grid)
|
|
278
|
-
|
|
279
|
-
############### Calculate positional coverage in dataset ###############
|
|
280
|
-
from ..preprocessing import calculate_coverage
|
|
281
|
-
calculate_coverage(adata,
|
|
282
|
-
obs_column=cfg.reference_column,
|
|
283
|
-
position_nan_threshold=cfg.position_max_nan_threshold)
|
|
284
385
|
|
|
285
386
|
############### Duplicate detection for conversion/deamination SMF ###############
|
|
286
387
|
if smf_modality != 'direct':
|
|
287
|
-
from ..preprocessing import flag_duplicate_reads, calculate_complexity_II
|
|
288
388
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
289
389
|
|
|
290
390
|
var_filters_sets =[]
|
|
@@ -342,22 +442,14 @@ def preprocess_adata(config_path):
|
|
|
342
442
|
########################################################################################################################
|
|
343
443
|
|
|
344
444
|
############################################### Save preprocessed adata with duplicate detection ###############################################
|
|
345
|
-
from ..readwrite import safe_write_h5ad
|
|
346
445
|
if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
347
446
|
print('Saving preprocessed adata.')
|
|
348
|
-
|
|
349
|
-
safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
|
|
350
|
-
else:
|
|
351
|
-
pp_adata_path = pp_adata_path.with_name(pp_adata_path.name + '.gz')
|
|
352
|
-
safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
|
|
447
|
+
write_gz_h5ad(adata, pp_adata_path)
|
|
353
448
|
|
|
354
449
|
if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
355
450
|
print('Saving preprocessed adata with duplicates removed.')
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
else:
|
|
359
|
-
pp_adata_path = pp_dup_rem_adata_path.with_name(pp_dup_rem_adata_path.name + '.gz')
|
|
360
|
-
safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
|
|
451
|
+
write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
|
|
452
|
+
|
|
361
453
|
########################################################################################################################
|
|
362
454
|
|
|
363
455
|
return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
|