smftools-0.2.3-py3-none-any.whl → smftools-0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/filter_reads_on_modification_thresholds.py

@@ -1,9 +1,14 @@
-import math
 import gc
+from typing import List, Optional, Sequence
+
+import anndata as ad
 import numpy as np
 import pandas as pd
-
-from
+
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+

 def filter_reads_on_modification_thresholds(
     adata: ad.AnnData,
@@ -15,32 +20,40 @@ def filter_reads_on_modification_thresholds(
     a_thresholds: Optional[Sequence[float]] = None,
     use_other_c_as_background: bool = False,
     min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
-    uns_flag: str =
+    uns_flag: str = "filter_reads_on_modification_thresholds_performed",
     bypass: bool = False,
     force_redo: bool = False,
-    reference_column: str =
+    reference_column: str = "Reference_strand",
     # memory-control options:
     batch_size: int = 200,
     compute_obs_if_missing: bool = True,
-    treat_zero_as_invalid: bool = False
+    treat_zero_as_invalid: bool = False,
 ) -> ad.AnnData:
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Filter reads based on per-read modification thresholds.
+
+    If required obs columns exist, they are used directly. Otherwise, the function
+    computes the relevant per-read metrics in batches and stores them in ``adata.obs``.
+
+    Args:
+        adata: AnnData object to filter.
+        smf_modality: SMF modality identifier.
+        mod_target_bases: List of target bases to evaluate.
+        gpc_thresholds: ``[min, max]`` thresholds for GpC (0..1) or ``None``.
+        cpg_thresholds: ``[min, max]`` thresholds for CpG (0..1) or ``None``.
+        any_c_thresholds: ``[min, max]`` thresholds for any C (0..1) or ``None``.
+        a_thresholds: ``[min, max]`` thresholds for A (0..1) or ``None``.
+        use_other_c_as_background: Require GpC/CpG > other_C background if present.
+        min_valid_fraction_positions_in_read_vs_ref: Minimum valid-site fraction per read.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        bypass: Whether to skip processing.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
+        reference_column: Obs column containing reference identifiers.
+        batch_size: Batch size for low-memory computation.
+        compute_obs_if_missing: Whether to compute missing obs summaries.
+        treat_zero_as_invalid: Whether zeros should be treated as invalid positions.
+
+    Returns:
+        anndata.AnnData: Filtered AnnData object.
     """

     # quick exit flags:
@@ -50,17 +63,23 @@ def filter_reads_on_modification_thresholds(

     # helper: check whether obs columns exist for a particular mod type
     def obs_has_columns_for(mod_type):
+        """Return True if per-read summary columns exist for a mod type."""
         col_pref = {
-            "GpC": ("Fraction_GpC_site_modified",
-            "CpG": ("Fraction_CpG_site_modified",
-            "C": ("Fraction_C_site_modified",
-            "A": ("Fraction_A_site_modified",
+            "GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
+            "CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
+            "C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
+            "A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
         }.get(mod_type, (None, None))
         return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)

     # if all required obs columns are present, use them directly (fast path)
     required_present = True
-    for mt, thr in (
+    for mt, thr in (
+        ("GpC", gpc_thresholds),
+        ("CpG", cpg_thresholds),
+        ("C", any_c_thresholds),
+        ("A", a_thresholds),
+    ):
         if thr is not None and mt in mod_target_bases:
             if not obs_has_columns_for(mt):
                 required_present = False
@@ -75,9 +94,10 @@ def filter_reads_on_modification_thresholds(
     # Build mapping from reference -> var column names (expected pattern)
     # e.g. var column names: "{ref}_GpC_site", "{ref}_CpG_site", "{ref}_any_C_site", "{ref}_other_C_site", "{ref}_A_site"
     # If your var column naming differs, adjust these suffixes.
-    refs = list(adata.obs[reference_column].astype(
+    refs = list(adata.obs[reference_column].astype("category").cat.categories)

     def _find_var_col_for(ref, suffix):
+        """Resolve a var column name for a reference/suffix pair."""
         name = f"{ref}_{suffix}"
         if name in adata.var.columns:
             return name
@@ -121,7 +141,9 @@ def filter_reads_on_modification_thresholds(
             var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
         except Exception:
             # if var has values not boolean, attempt coercion
-            var_mask_bool = np.asarray(
+            var_mask_bool = np.asarray(
+                pd.to_numeric(adata.var[var_colname], errors="coerce").fillna(0).astype(bool)
+            )

         if not var_mask_bool.any():
             return
@@ -154,16 +176,20 @@ def filter_reads_on_modification_thresholds(
             # valid_count = (non-nan if float data else non-zero) per row
             # For sparse, .data are only stored nonzeros, so (X_block > 0).sum is fine
             modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel()
-            if np.isnan(X_block.data).any() if hasattr(X_block,
+            if np.isnan(X_block.data).any() if hasattr(X_block, "data") else False:
                 # if sparse with stored NaNs (!) handle differently - unlikely
-                valid_count = np.asarray(
+                valid_count = np.asarray(
+                    ~np.isnan(X_block.toarray()).sum(axis=1)
+                ).ravel()
             else:
                 if treat_zero_as_invalid:
                     # valid = number of non-zero entries
                     valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel()
                 else:
                     # treat all positions as valid positions (they exist in reference) -> denominator = n_cols_for_ref
-                    valid_count = np.full_like(
+                    valid_count = np.full_like(
+                        modified_count, n_cols_for_ref, dtype=float
+                    )
         else:
             # dense numpy
             Xb = np.asarray(X_block)
@@ -193,14 +219,18 @@ def filter_reads_on_modification_thresholds(

         # fraction modified = modified_count / valid_count (guard divide-by-zero)
         frac = np.zeros_like(modified_count, dtype=float)
-        mask_valid_nonzero =
-        frac[mask_valid_nonzero] =
+        mask_valid_nonzero = valid_count > 0
+        frac[mask_valid_nonzero] = (
+            modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
+        )

         # write to out arrays
         out_frac_arr[block_rows_idx] = frac
         # valid fraction relative to reference = valid_count / n_cols_for_ref
         out_valid_arr[block_rows_idx] = np.zeros_like(valid_count, dtype=float)
-        out_valid_arr[block_rows_idx][mask_valid_nonzero] =
+        out_valid_arr[block_rows_idx][mask_valid_nonzero] = valid_count[
+            mask_valid_nonzero
+        ] / float(n_cols_for_ref)

         # free block memory ASAP
         del X_block, modified_count, valid_count, frac
@@ -210,29 +240,51 @@ def filter_reads_on_modification_thresholds(
     # GpC
     if "GpC" in mod_target_bases:
         for ref in refs:
-            _compute_for_ref_and_suffix(
+            _compute_for_ref_and_suffix(
+                ref,
+                "GpC_site",
+                create_cols["Fraction_GpC_site_modified"],
+                create_cols["Valid_GpC_site_in_read_vs_reference"],
+            )
     # other_C (for background)
     # We'll also compute 'other_C' per reference if it exists
     other_c_per_ref = {}
     for ref in refs:
         other_col = _find_var_col_for(ref, "other_C_site")
         if other_col:
-            other_c_per_ref[ref] = np.where(
+            other_c_per_ref[ref] = np.where(
+                np.asarray(adata.var[other_col].values).astype(bool)
+            )[0]

     # CpG
     if "CpG" in mod_target_bases:
         for ref in refs:
-            _compute_for_ref_and_suffix(
+            _compute_for_ref_and_suffix(
+                ref,
+                "CpG_site",
+                create_cols["Fraction_CpG_site_modified"],
+                create_cols["Valid_CpG_site_in_read_vs_reference"],
+            )

     # any C
     if "C" in mod_target_bases:
         for ref in refs:
-            _compute_for_ref_and_suffix(
+            _compute_for_ref_and_suffix(
+                ref,
+                "C_site",
+                create_cols["Fraction_C_site_modified"],
+                create_cols["Valid_C_site_in_read_vs_reference"],
+            )

     # A
     if "A" in mod_target_bases:
         for ref in refs:
-            _compute_for_ref_and_suffix(
+            _compute_for_ref_and_suffix(
+                ref,
+                "A_site",
+                create_cols["Fraction_A_site_modified"],
+                create_cols["Valid_A_site_in_read_vs_reference"],
+            )

     # write created arrays into adata.obs
     for cname, arr in create_cols.items():
@@ -243,16 +295,20 @@ def filter_reads_on_modification_thresholds(
         # compute per-ref background ratio if both exist
         # Simplest approach: if 'Fraction_GpC_site_modified' and 'Fraction_other_C_site_modified' exist, compute ratio
         if "Fraction_other_C_site_modified" in adata.obs.columns:
-            with np.errstate(divide=
-                ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
+            with np.errstate(divide="ignore", invalid="ignore"):
+                ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
+                    "Fraction_other_C_site_modified"
+                ].astype(float)
             adata.obs["GpC_to_other_C_mod_ratio"] = ratio.fillna(0.0)
         else:
             adata.obs["GpC_to_other_C_mod_ratio"] = np.nan

     if "CpG" in mod_target_bases and use_other_c_as_background:
         if "Fraction_other_C_site_modified" in adata.obs.columns:
-            with np.errstate(divide=
-                ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
+            with np.errstate(divide="ignore", invalid="ignore"):
+                ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
+                    "Fraction_other_C_site_modified"
+                ].astype(float)
             adata.obs["CpG_to_other_C_mod_ratio"] = ratio.fillna(0.0)
         else:
             adata.obs["CpG_to_other_C_mod_ratio"] = np.nan
@@ -266,10 +322,14 @@ def filter_reads_on_modification_thresholds(

     # helper to get min/max from param like [min, max] or tuple(None,..)
     def _unpack_minmax(thr):
+        """Normalize a threshold pair to ordered (min, max) floats."""
         if thr is None:
             return None, None
         try:
-            lo, hi =
+            lo, hi = (
+                float(thr[0]) if thr[0] is not None else None,
+                float(thr[1]) if thr[1] is not None else None,
+            )
             if lo is not None and hi is not None and lo > hi:
                 lo, hi = hi, lo
             return lo, hi
@@ -277,76 +337,124 @@ def filter_reads_on_modification_thresholds(
             return None, None

     # GpC thresholds
-    if gpc_thresholds and
+    if gpc_thresholds and "GpC" in mod_target_bases:
         lo, hi = _unpack_minmax(gpc_thresholds)
-        if
+        if (
+            use_other_c_as_background
+            and smf_modality != "deaminase"
+            and "GpC_to_other_C_mod_ratio" in filtered.obs.columns
+        ):
             filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
         if lo is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
-
+            logger.info("Removed %s reads below min GpC fraction %s", s0 - filtered.n_obs, lo)
         if hi is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
-
-        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            logger.info("Removed %s reads above max GpC fraction %s", s0 - filtered.n_obs, hi)
+        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            "Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns
+        ):
             s0 = filtered.n_obs
-            filtered = filtered[
-
+            filtered = filtered[
+                filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
+                >= float(min_valid_fraction_positions_in_read_vs_ref)
+            ]
+            logger.info(
+                "Removed %s reads with insufficient valid GpC site fraction vs ref",
+                s0 - filtered.n_obs,
+            )

     # CpG thresholds
-    if cpg_thresholds and
+    if cpg_thresholds and "CpG" in mod_target_bases:
         lo, hi = _unpack_minmax(cpg_thresholds)
-        if
+        if (
+            use_other_c_as_background
+            and smf_modality != "deaminase"
+            and "CpG_to_other_C_mod_ratio" in filtered.obs.columns
+        ):
             filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
         if lo is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
-
+            logger.info("Removed %s reads below min CpG fraction %s", s0 - filtered.n_obs, lo)
         if hi is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
-
-        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            logger.info("Removed %s reads above max CpG fraction %s", s0 - filtered.n_obs, hi)
+        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            "Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns
+        ):
             s0 = filtered.n_obs
-            filtered = filtered[
-
+            filtered = filtered[
+                filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
+                >= float(min_valid_fraction_positions_in_read_vs_ref)
+            ]
+            logger.info(
+                "Removed %s reads with insufficient valid CpG site fraction vs ref",
+                s0 - filtered.n_obs,
+            )

     # any C thresholds
-    if any_c_thresholds and
+    if any_c_thresholds and "C" in mod_target_bases:
         lo, hi = _unpack_minmax(any_c_thresholds)
         if lo is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
-
+            logger.info(
+                "Removed %s reads below min any-C fraction %s",
+                s0 - filtered.n_obs,
+                lo,
+            )
         if hi is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
-
-
+            logger.info(
+                "Removed %s reads above max any-C fraction %s",
+                s0 - filtered.n_obs,
+                hi,
+            )
+        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            "Valid_C_site_in_read_vs_reference" in filtered.obs.columns
+        ):
             s0 = filtered.n_obs
-            filtered = filtered[
-
+            filtered = filtered[
+                filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float)
+                >= float(min_valid_fraction_positions_in_read_vs_ref)
+            ]
+            logger.info(
+                "Removed %s reads with insufficient valid any-C site fraction vs ref",
+                s0 - filtered.n_obs,
+            )

     # A thresholds
-    if a_thresholds and
+    if a_thresholds and "A" in mod_target_bases:
         lo, hi = _unpack_minmax(a_thresholds)
         if lo is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
-
+            logger.info("Removed %s reads below min A fraction %s", s0 - filtered.n_obs, lo)
        if hi is not None:
             s0 = filtered.n_obs
             filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
-
-        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            logger.info("Removed %s reads above max A fraction %s", s0 - filtered.n_obs, hi)
+        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+            "Valid_A_site_in_read_vs_reference" in filtered.obs.columns
+        ):
             s0 = filtered.n_obs
-            filtered = filtered[
-
+            filtered = filtered[
+                filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
+                >= float(min_valid_fraction_positions_in_read_vs_ref)
+            ]
+            logger.info(
+                "Removed %s reads with insufficient valid A site fraction vs ref",
+                s0 - filtered.n_obs,
+            )

     filtered = filtered.copy()

     # mark as done
     filtered.uns[uns_flag] = True

-    return filtered
+    return filtered
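
For orientation, a minimal usage sketch of the updated filter follows, based only on the signature and docstring visible in this diff. The import path mirrors the module path listed above (the function may also be re-exported from smftools.preprocessing); the modality label, input file name, and threshold values are illustrative assumptions, not taken from the package documentation.

# Minimal usage sketch; modality label, file name, and thresholds are assumptions.
import anndata as ad

from smftools.preprocessing.filter_reads_on_modification_thresholds import (
    filter_reads_on_modification_thresholds,
)

adata = ad.read_h5ad("reads.h5ad")  # hypothetical input AnnData of per-read SMF calls

filtered = filter_reads_on_modification_thresholds(
    adata,
    smf_modality="conversion",        # assumed modality label
    mod_target_bases=["GpC", "CpG"],
    gpc_thresholds=[0.05, 0.95],      # [min, max] fraction of GpC sites modified, per the docstring
    cpg_thresholds=None,
    use_other_c_as_background=True,   # keep reads whose GpC/CpG signal exceeds the other-C background
    min_valid_fraction_positions_in_read_vs_ref=0.5,
    batch_size=200,                   # per-read metrics are computed in batches to bound memory
    treat_zero_as_invalid=False,
)
print(filtered.n_obs, "reads retained")

As the new docstring notes, if the per-read summary columns (e.g. Fraction_GpC_site_modified, Valid_GpC_site_in_read_vs_reference) already exist in adata.obs, they are used directly; otherwise they are computed in batches before filtering, and removal counts are now reported through the module logger instead of print statements.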