smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,9 +1,16 @@
|
|
|
1
|
-
import
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
2
3
|
import gc
|
|
4
|
+
from typing import List, Optional, Sequence
|
|
5
|
+
|
|
6
|
+
import anndata as ad
|
|
3
7
|
import numpy as np
|
|
4
8
|
import pandas as pd
|
|
5
|
-
|
|
6
|
-
from
|
|
9
|
+
|
|
10
|
+
from smftools.logging_utils import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
7
14
|
|
|
8
15
|
def filter_reads_on_modification_thresholds(
|
|
9
16
|
adata: ad.AnnData,
|
|
@@ -15,32 +22,40 @@ def filter_reads_on_modification_thresholds(
|
|
|
15
22
|
a_thresholds: Optional[Sequence[float]] = None,
|
|
16
23
|
use_other_c_as_background: bool = False,
|
|
17
24
|
min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
|
|
18
|
-
uns_flag: str =
|
|
25
|
+
uns_flag: str = "filter_reads_on_modification_thresholds_performed",
|
|
19
26
|
bypass: bool = False,
|
|
20
27
|
force_redo: bool = False,
|
|
21
|
-
reference_column: str =
|
|
28
|
+
reference_column: str = "Reference_strand",
|
|
22
29
|
# memory-control options:
|
|
23
30
|
batch_size: int = 200,
|
|
24
31
|
compute_obs_if_missing: bool = True,
|
|
25
|
-
treat_zero_as_invalid: bool = False
|
|
32
|
+
treat_zero_as_invalid: bool = False,
|
|
26
33
|
) -> ad.AnnData:
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
34
|
+
"""Filter reads based on per-read modification thresholds.
|
|
35
|
+
|
|
36
|
+
If required obs columns exist, they are used directly. Otherwise, the function
|
|
37
|
+
computes the relevant per-read metrics in batches and stores them in ``adata.obs``.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
adata: AnnData object to filter.
|
|
41
|
+
smf_modality: SMF modality identifier.
|
|
42
|
+
mod_target_bases: List of target bases to evaluate.
|
|
43
|
+
gpc_thresholds: ``[min, max]`` thresholds for GpC (0..1) or ``None``.
|
|
44
|
+
cpg_thresholds: ``[min, max]`` thresholds for CpG (0..1) or ``None``.
|
|
45
|
+
any_c_thresholds: ``[min, max]`` thresholds for any C (0..1) or ``None``.
|
|
46
|
+
a_thresholds: ``[min, max]`` thresholds for A (0..1) or ``None``.
|
|
47
|
+
use_other_c_as_background: Require GpC/CpG > other_C background if present.
|
|
48
|
+
min_valid_fraction_positions_in_read_vs_ref: Minimum valid-site fraction per read.
|
|
49
|
+
uns_flag: Flag in ``adata.uns`` indicating prior completion.
|
|
50
|
+
bypass: Whether to skip processing.
|
|
51
|
+
force_redo: Whether to rerun even if ``uns_flag`` is set.
|
|
52
|
+
reference_column: Obs column containing reference identifiers.
|
|
53
|
+
batch_size: Batch size for low-memory computation.
|
|
54
|
+
compute_obs_if_missing: Whether to compute missing obs summaries.
|
|
55
|
+
treat_zero_as_invalid: Whether zeros should be treated as invalid positions.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
anndata.AnnData: Filtered AnnData object.
|
|
44
59
|
"""
|
|
45
60
|
|
|
46
61
|
# quick exit flags:
|
|
@@ -50,17 +65,23 @@ def filter_reads_on_modification_thresholds(
|
|
|
50
65
|
|
|
51
66
|
# helper: check whether obs columns exist for a particular mod type
|
|
52
67
|
def obs_has_columns_for(mod_type):
|
|
68
|
+
"""Return True if per-read summary columns exist for a mod type."""
|
|
53
69
|
col_pref = {
|
|
54
|
-
"GpC": ("Fraction_GpC_site_modified",
|
|
55
|
-
"CpG": ("Fraction_CpG_site_modified",
|
|
56
|
-
"C": ("Fraction_C_site_modified",
|
|
57
|
-
"A": ("Fraction_A_site_modified",
|
|
70
|
+
"GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
|
|
71
|
+
"CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
|
|
72
|
+
"C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
|
|
73
|
+
"A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
|
|
58
74
|
}.get(mod_type, (None, None))
|
|
59
75
|
return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
|
|
60
76
|
|
|
61
77
|
# if all required obs columns are present, use them directly (fast path)
|
|
62
78
|
required_present = True
|
|
63
|
-
for mt, thr in (
|
|
79
|
+
for mt, thr in (
|
|
80
|
+
("GpC", gpc_thresholds),
|
|
81
|
+
("CpG", cpg_thresholds),
|
|
82
|
+
("C", any_c_thresholds),
|
|
83
|
+
("A", a_thresholds),
|
|
84
|
+
):
|
|
64
85
|
if thr is not None and mt in mod_target_bases:
|
|
65
86
|
if not obs_has_columns_for(mt):
|
|
66
87
|
required_present = False
|
|
@@ -75,9 +96,10 @@ def filter_reads_on_modification_thresholds(
|
|
|
75
96
|
# Build mapping from reference -> var column names (expected pattern)
|
|
76
97
|
# e.g. var column names: "{ref}_GpC_site", "{ref}_CpG_site", "{ref}_any_C_site", "{ref}_other_C_site", "{ref}_A_site"
|
|
77
98
|
# If your var column naming differs, adjust these suffixes.
|
|
78
|
-
refs = list(adata.obs[reference_column].astype(
|
|
99
|
+
refs = list(adata.obs[reference_column].astype("category").cat.categories)
|
|
79
100
|
|
|
80
101
|
def _find_var_col_for(ref, suffix):
|
|
102
|
+
"""Resolve a var column name for a reference/suffix pair."""
|
|
81
103
|
name = f"{ref}_{suffix}"
|
|
82
104
|
if name in adata.var.columns:
|
|
83
105
|
return name
|
|
@@ -121,7 +143,9 @@ def filter_reads_on_modification_thresholds(
|
|
|
121
143
|
var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
|
|
122
144
|
except Exception:
|
|
123
145
|
# if var has values not boolean, attempt coercion
|
|
124
|
-
var_mask_bool = np.asarray(
|
|
146
|
+
var_mask_bool = np.asarray(
|
|
147
|
+
pd.to_numeric(adata.var[var_colname], errors="coerce").fillna(0).astype(bool)
|
|
148
|
+
)
|
|
125
149
|
|
|
126
150
|
if not var_mask_bool.any():
|
|
127
151
|
return
|
|
@@ -154,16 +178,20 @@ def filter_reads_on_modification_thresholds(
|
|
|
154
178
|
# valid_count = (non-nan if float data else non-zero) per row
|
|
155
179
|
# For sparse, .data are only stored nonzeros, so (X_block > 0).sum is fine
|
|
156
180
|
modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel()
|
|
157
|
-
if np.isnan(X_block.data).any() if hasattr(X_block,
|
|
181
|
+
if np.isnan(X_block.data).any() if hasattr(X_block, "data") else False:
|
|
158
182
|
# if sparse with stored NaNs (!) handle differently - unlikely
|
|
159
|
-
valid_count = np.asarray(
|
|
183
|
+
valid_count = np.asarray(
|
|
184
|
+
~np.isnan(X_block.toarray()).sum(axis=1)
|
|
185
|
+
).ravel()
|
|
160
186
|
else:
|
|
161
187
|
if treat_zero_as_invalid:
|
|
162
188
|
# valid = number of non-zero entries
|
|
163
189
|
valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel()
|
|
164
190
|
else:
|
|
165
191
|
# treat all positions as valid positions (they exist in reference) -> denominator = n_cols_for_ref
|
|
166
|
-
valid_count = np.full_like(
|
|
192
|
+
valid_count = np.full_like(
|
|
193
|
+
modified_count, n_cols_for_ref, dtype=float
|
|
194
|
+
)
|
|
167
195
|
else:
|
|
168
196
|
# dense numpy
|
|
169
197
|
Xb = np.asarray(X_block)
|
|
@@ -193,14 +221,18 @@ def filter_reads_on_modification_thresholds(
|
|
|
193
221
|
|
|
194
222
|
# fraction modified = modified_count / valid_count (guard divide-by-zero)
|
|
195
223
|
frac = np.zeros_like(modified_count, dtype=float)
|
|
196
|
-
mask_valid_nonzero =
|
|
197
|
-
frac[mask_valid_nonzero] =
|
|
224
|
+
mask_valid_nonzero = valid_count > 0
|
|
225
|
+
frac[mask_valid_nonzero] = (
|
|
226
|
+
modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
|
|
227
|
+
)
|
|
198
228
|
|
|
199
229
|
# write to out arrays
|
|
200
230
|
out_frac_arr[block_rows_idx] = frac
|
|
201
231
|
# valid fraction relative to reference = valid_count / n_cols_for_ref
|
|
202
232
|
out_valid_arr[block_rows_idx] = np.zeros_like(valid_count, dtype=float)
|
|
203
|
-
out_valid_arr[block_rows_idx][mask_valid_nonzero] =
|
|
233
|
+
out_valid_arr[block_rows_idx][mask_valid_nonzero] = valid_count[
|
|
234
|
+
mask_valid_nonzero
|
|
235
|
+
] / float(n_cols_for_ref)
|
|
204
236
|
|
|
205
237
|
# free block memory ASAP
|
|
206
238
|
del X_block, modified_count, valid_count, frac
|
|
@@ -210,29 +242,51 @@ def filter_reads_on_modification_thresholds(
|
|
|
210
242
|
# GpC
|
|
211
243
|
if "GpC" in mod_target_bases:
|
|
212
244
|
for ref in refs:
|
|
213
|
-
_compute_for_ref_and_suffix(
|
|
245
|
+
_compute_for_ref_and_suffix(
|
|
246
|
+
ref,
|
|
247
|
+
"GpC_site",
|
|
248
|
+
create_cols["Fraction_GpC_site_modified"],
|
|
249
|
+
create_cols["Valid_GpC_site_in_read_vs_reference"],
|
|
250
|
+
)
|
|
214
251
|
# other_C (for background)
|
|
215
252
|
# We'll also compute 'other_C' per reference if it exists
|
|
216
253
|
other_c_per_ref = {}
|
|
217
254
|
for ref in refs:
|
|
218
255
|
other_col = _find_var_col_for(ref, "other_C_site")
|
|
219
256
|
if other_col:
|
|
220
|
-
other_c_per_ref[ref] = np.where(
|
|
257
|
+
other_c_per_ref[ref] = np.where(
|
|
258
|
+
np.asarray(adata.var[other_col].values).astype(bool)
|
|
259
|
+
)[0]
|
|
221
260
|
|
|
222
261
|
# CpG
|
|
223
262
|
if "CpG" in mod_target_bases:
|
|
224
263
|
for ref in refs:
|
|
225
|
-
_compute_for_ref_and_suffix(
|
|
264
|
+
_compute_for_ref_and_suffix(
|
|
265
|
+
ref,
|
|
266
|
+
"CpG_site",
|
|
267
|
+
create_cols["Fraction_CpG_site_modified"],
|
|
268
|
+
create_cols["Valid_CpG_site_in_read_vs_reference"],
|
|
269
|
+
)
|
|
226
270
|
|
|
227
271
|
# any C
|
|
228
272
|
if "C" in mod_target_bases:
|
|
229
273
|
for ref in refs:
|
|
230
|
-
_compute_for_ref_and_suffix(
|
|
274
|
+
_compute_for_ref_and_suffix(
|
|
275
|
+
ref,
|
|
276
|
+
"C_site",
|
|
277
|
+
create_cols["Fraction_C_site_modified"],
|
|
278
|
+
create_cols["Valid_C_site_in_read_vs_reference"],
|
|
279
|
+
)
|
|
231
280
|
|
|
232
281
|
# A
|
|
233
282
|
if "A" in mod_target_bases:
|
|
234
283
|
for ref in refs:
|
|
235
|
-
_compute_for_ref_and_suffix(
|
|
284
|
+
_compute_for_ref_and_suffix(
|
|
285
|
+
ref,
|
|
286
|
+
"A_site",
|
|
287
|
+
create_cols["Fraction_A_site_modified"],
|
|
288
|
+
create_cols["Valid_A_site_in_read_vs_reference"],
|
|
289
|
+
)
|
|
236
290
|
|
|
237
291
|
# write created arrays into adata.obs
|
|
238
292
|
for cname, arr in create_cols.items():
|
|
@@ -243,16 +297,20 @@ def filter_reads_on_modification_thresholds(
|
|
|
243
297
|
# compute per-ref background ratio if both exist
|
|
244
298
|
# Simplest approach: if 'Fraction_GpC_site_modified' and 'Fraction_other_C_site_modified' exist, compute ratio
|
|
245
299
|
if "Fraction_other_C_site_modified" in adata.obs.columns:
|
|
246
|
-
with np.errstate(divide=
|
|
247
|
-
ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
|
|
300
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
|
301
|
+
ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
|
|
302
|
+
"Fraction_other_C_site_modified"
|
|
303
|
+
].astype(float)
|
|
248
304
|
adata.obs["GpC_to_other_C_mod_ratio"] = ratio.fillna(0.0)
|
|
249
305
|
else:
|
|
250
306
|
adata.obs["GpC_to_other_C_mod_ratio"] = np.nan
|
|
251
307
|
|
|
252
308
|
if "CpG" in mod_target_bases and use_other_c_as_background:
|
|
253
309
|
if "Fraction_other_C_site_modified" in adata.obs.columns:
|
|
254
|
-
with np.errstate(divide=
|
|
255
|
-
ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
|
|
310
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
|
311
|
+
ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
|
|
312
|
+
"Fraction_other_C_site_modified"
|
|
313
|
+
].astype(float)
|
|
256
314
|
adata.obs["CpG_to_other_C_mod_ratio"] = ratio.fillna(0.0)
|
|
257
315
|
else:
|
|
258
316
|
adata.obs["CpG_to_other_C_mod_ratio"] = np.nan
|
|
@@ -266,10 +324,14 @@ def filter_reads_on_modification_thresholds(
|
|
|
266
324
|
|
|
267
325
|
# helper to get min/max from param like [min, max] or tuple(None,..)
|
|
268
326
|
def _unpack_minmax(thr):
|
|
327
|
+
"""Normalize a threshold pair to ordered (min, max) floats."""
|
|
269
328
|
if thr is None:
|
|
270
329
|
return None, None
|
|
271
330
|
try:
|
|
272
|
-
lo, hi =
|
|
331
|
+
lo, hi = (
|
|
332
|
+
float(thr[0]) if thr[0] is not None else None,
|
|
333
|
+
float(thr[1]) if thr[1] is not None else None,
|
|
334
|
+
)
|
|
273
335
|
if lo is not None and hi is not None and lo > hi:
|
|
274
336
|
lo, hi = hi, lo
|
|
275
337
|
return lo, hi
|
|
@@ -277,76 +339,124 @@ def filter_reads_on_modification_thresholds(
|
|
|
277
339
|
return None, None
|
|
278
340
|
|
|
279
341
|
# GpC thresholds
|
|
280
|
-
if gpc_thresholds and
|
|
342
|
+
if gpc_thresholds and "GpC" in mod_target_bases:
|
|
281
343
|
lo, hi = _unpack_minmax(gpc_thresholds)
|
|
282
|
-
if
|
|
344
|
+
if (
|
|
345
|
+
use_other_c_as_background
|
|
346
|
+
and smf_modality != "deaminase"
|
|
347
|
+
and "GpC_to_other_C_mod_ratio" in filtered.obs.columns
|
|
348
|
+
):
|
|
283
349
|
filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
|
|
284
350
|
if lo is not None:
|
|
285
351
|
s0 = filtered.n_obs
|
|
286
352
|
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
|
|
287
|
-
|
|
353
|
+
logger.info("Removed %s reads below min GpC fraction %s", s0 - filtered.n_obs, lo)
|
|
288
354
|
if hi is not None:
|
|
289
355
|
s0 = filtered.n_obs
|
|
290
356
|
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
|
|
291
|
-
|
|
292
|
-
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
357
|
+
logger.info("Removed %s reads above max GpC fraction %s", s0 - filtered.n_obs, hi)
|
|
358
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
359
|
+
"Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns
|
|
360
|
+
):
|
|
293
361
|
s0 = filtered.n_obs
|
|
294
|
-
filtered = filtered[
|
|
295
|
-
|
|
362
|
+
filtered = filtered[
|
|
363
|
+
filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
|
|
364
|
+
>= float(min_valid_fraction_positions_in_read_vs_ref)
|
|
365
|
+
]
|
|
366
|
+
logger.info(
|
|
367
|
+
"Removed %s reads with insufficient valid GpC site fraction vs ref",
|
|
368
|
+
s0 - filtered.n_obs,
|
|
369
|
+
)
|
|
296
370
|
|
|
297
371
|
# CpG thresholds
|
|
298
|
-
if cpg_thresholds and
|
|
372
|
+
if cpg_thresholds and "CpG" in mod_target_bases:
|
|
299
373
|
lo, hi = _unpack_minmax(cpg_thresholds)
|
|
300
|
-
if
|
|
374
|
+
if (
|
|
375
|
+
use_other_c_as_background
|
|
376
|
+
and smf_modality != "deaminase"
|
|
377
|
+
and "CpG_to_other_C_mod_ratio" in filtered.obs.columns
|
|
378
|
+
):
|
|
301
379
|
filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
|
|
302
380
|
if lo is not None:
|
|
303
381
|
s0 = filtered.n_obs
|
|
304
382
|
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
|
|
305
|
-
|
|
383
|
+
logger.info("Removed %s reads below min CpG fraction %s", s0 - filtered.n_obs, lo)
|
|
306
384
|
if hi is not None:
|
|
307
385
|
s0 = filtered.n_obs
|
|
308
386
|
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
|
|
309
|
-
|
|
310
|
-
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
387
|
+
logger.info("Removed %s reads above max CpG fraction %s", s0 - filtered.n_obs, hi)
|
|
388
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
389
|
+
"Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns
|
|
390
|
+
):
|
|
311
391
|
s0 = filtered.n_obs
|
|
312
|
-
filtered = filtered[
|
|
313
|
-
|
|
392
|
+
filtered = filtered[
|
|
393
|
+
filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
|
|
394
|
+
>= float(min_valid_fraction_positions_in_read_vs_ref)
|
|
395
|
+
]
|
|
396
|
+
logger.info(
|
|
397
|
+
"Removed %s reads with insufficient valid CpG site fraction vs ref",
|
|
398
|
+
s0 - filtered.n_obs,
|
|
399
|
+
)
|
|
314
400
|
|
|
315
401
|
# any C thresholds
|
|
316
|
-
if any_c_thresholds and
|
|
402
|
+
if any_c_thresholds and "C" in mod_target_bases:
|
|
317
403
|
lo, hi = _unpack_minmax(any_c_thresholds)
|
|
318
404
|
if lo is not None:
|
|
319
405
|
s0 = filtered.n_obs
|
|
320
406
|
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
|
|
321
|
-
|
|
407
|
+
logger.info(
|
|
408
|
+
"Removed %s reads below min any-C fraction %s",
|
|
409
|
+
s0 - filtered.n_obs,
|
|
410
|
+
lo,
|
|
411
|
+
)
|
|
322
412
|
if hi is not None:
|
|
323
413
|
s0 = filtered.n_obs
|
|
324
414
|
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
|
|
325
|
-
|
|
326
|
-
|
|
415
|
+
logger.info(
|
|
416
|
+
"Removed %s reads above max any-C fraction %s",
|
|
417
|
+
s0 - filtered.n_obs,
|
|
418
|
+
hi,
|
|
419
|
+
)
|
|
420
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
421
|
+
"Valid_C_site_in_read_vs_reference" in filtered.obs.columns
|
|
422
|
+
):
|
|
327
423
|
s0 = filtered.n_obs
|
|
328
|
-
filtered = filtered[
|
|
329
|
-
|
|
424
|
+
filtered = filtered[
|
|
425
|
+
filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float)
|
|
426
|
+
>= float(min_valid_fraction_positions_in_read_vs_ref)
|
|
427
|
+
]
|
|
428
|
+
logger.info(
|
|
429
|
+
"Removed %s reads with insufficient valid any-C site fraction vs ref",
|
|
430
|
+
s0 - filtered.n_obs,
|
|
431
|
+
)
|
|
330
432
|
|
|
331
433
|
# A thresholds
|
|
332
|
-
if a_thresholds and
|
|
434
|
+
if a_thresholds and "A" in mod_target_bases:
|
|
333
435
|
lo, hi = _unpack_minmax(a_thresholds)
|
|
334
436
|
if lo is not None:
|
|
335
437
|
s0 = filtered.n_obs
|
|
336
438
|
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
|
|
337
|
-
|
|
439
|
+
logger.info("Removed %s reads below min A fraction %s", s0 - filtered.n_obs, lo)
|
|
338
440
|
if hi is not None:
|
|
339
441
|
s0 = filtered.n_obs
|
|
340
442
|
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
|
|
341
|
-
|
|
342
|
-
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
443
|
+
logger.info("Removed %s reads above max A fraction %s", s0 - filtered.n_obs, hi)
|
|
444
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
|
|
445
|
+
"Valid_A_site_in_read_vs_reference" in filtered.obs.columns
|
|
446
|
+
):
|
|
343
447
|
s0 = filtered.n_obs
|
|
344
|
-
filtered = filtered[
|
|
345
|
-
|
|
448
|
+
filtered = filtered[
|
|
449
|
+
filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
|
|
450
|
+
>= float(min_valid_fraction_positions_in_read_vs_ref)
|
|
451
|
+
]
|
|
452
|
+
logger.info(
|
|
453
|
+
"Removed %s reads with insufficient valid A site fraction vs ref",
|
|
454
|
+
s0 - filtered.n_obs,
|
|
455
|
+
)
|
|
346
456
|
|
|
347
457
|
filtered = filtered.copy()
|
|
348
458
|
|
|
349
459
|
# mark as done
|
|
350
460
|
filtered.uns[uns_flag] = True
|
|
351
461
|
|
|
352
|
-
return filtered
|
|
462
|
+
return filtered
|