smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import gc
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import anndata as ad
|
|
6
|
+
from typing import Optional, Sequence, List
|
|
7
|
+
|
|
8
|
+
def filter_reads_on_modification_thresholds(
    adata: "ad.AnnData",
    smf_modality: str,
    mod_target_bases: Optional[List[str]] = None,
    gpc_thresholds: Optional[Sequence[float]] = None,
    cpg_thresholds: Optional[Sequence[float]] = None,
    any_c_thresholds: Optional[Sequence[float]] = None,
    a_thresholds: Optional[Sequence[float]] = None,
    use_other_c_as_background: bool = False,
    min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
    uns_flag: str = 'reads_filtered_on_modification_thresholds',
    bypass: bool = False,
    force_redo: bool = False,
    reference_column: str = 'Reference_strand',
    # memory-control options:
    batch_size: int = 200,
    compute_obs_if_missing: bool = True,
    treat_zero_as_invalid: bool = False
) -> "ad.AnnData":
    """
    Memory-efficient filtering of reads by per-read modification thresholds.

    - If the required per-read summary columns exist in ``adata.obs``, they are
      used directly (fast path).
    - Otherwise, the relevant per-read metrics are computed per-reference in row
      batches and written into ``adata.obs`` before filtering.

    Parameters of interest:
    - mod_target_bases: subset of {"GpC", "CpG", "C", "A"} to consider.
      Defaults to no bases (no filtering). (``None`` default replaces the
      original mutable ``[]`` default.)
    - gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
      each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
    - use_other_c_as_background: require GpC/CpG > other_C background (if present;
      skipped for the 'deaminase' modality).
    - min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
      in the read vs reference (0..1). If None, this check is skipped.
    - compute_obs_if_missing: if True, compute Fraction_* and Valid_* obs columns
      if they are not already present, using a low-memory per-ref strategy.
    - treat_zero_as_invalid: if True, a zero in X counts as invalid (non-site).
      If False, zeros are considered valid positions (adjust to your data semantics).

    Returns a filtered copy of ``adata`` (or ``adata`` unchanged when bypassed
    or already filtered and not forced).

    Raises RuntimeError when required obs columns are missing and
    ``compute_obs_if_missing`` is False.
    """
    # quick exit flags:
    already = bool(adata.uns.get(uns_flag, False))
    if (already and not force_redo) or bypass:
        return adata

    # Normalize the default (avoids the shared-mutable-default pitfall).
    mod_target_bases = list(mod_target_bases) if mod_target_bases is not None else []

    # Canonical obs column names per mod type: (fraction column, valid-fraction column).
    frac_valid_cols = {
        "GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
        "CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
        "C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
        "A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
    }

    def obs_has_columns_for(mod_type):
        # True only when BOTH summary columns for this mod type are present.
        frac_col, valid_col = frac_valid_cols.get(mod_type, (None, None))
        return (frac_col in adata.obs.columns) and (valid_col in adata.obs.columns)

    # Fast path: all summary columns needed by the requested thresholds exist.
    required_present = all(
        obs_has_columns_for(mt)
        for mt, thr in (
            ("GpC", gpc_thresholds),
            ("CpG", cpg_thresholds),
            ("C", any_c_thresholds),
            ("A", a_thresholds),
        )
        if thr is not None and mt in mod_target_bases
    )

    # If required obs columns are not present and compute_obs_if_missing is False => error
    if not required_present and not compute_obs_if_missing:
        raise RuntimeError(
            "Required per-read summary columns not found in adata.obs and compute_obs_if_missing is False."
        )

    # Expected var mask naming pattern: "{ref}_GpC_site", "{ref}_CpG_site",
    # "{ref}_C_site", "{ref}_other_C_site", "{ref}_A_site".
    # If your var column naming differs, adjust the suffixes passed below.
    refs = list(adata.obs[reference_column].astype('category').cat.categories)

    def _find_var_col_for(ref, suffix):
        # Return the var column name for this reference/suffix, or None if absent.
        name = f"{ref}_{suffix}"
        return name if name in adata.var.columns else None

    def _block_counts(X_block, n_cols_for_ref):
        """
        Return (modified_count, valid_count) per read for one rows-x-sites slice.

        Handles sparse and dense slices; sparse data is only densified in the
        unlikely case of explicitly stored NaNs. "Modified" means value > 0.
        """
        if hasattr(X_block, "toarray") and not isinstance(X_block, np.ndarray):
            # sparse: (X > 0) only touches stored entries, so this is cheap
            modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel().astype(float)
            stored = getattr(X_block, "data", None)
            if stored is not None and np.isnan(stored).any():
                # Explicitly stored NaNs (unusual): densify this small block and
                # count non-NaN entries. NOTE: the original code computed
                # ~isnan(...).sum(...) which bitwise-negated the NaN *count*.
                valid_count = np.sum(~np.isnan(X_block.toarray()), axis=1).astype(float)
            elif treat_zero_as_invalid:
                # valid = number of non-zero entries
                valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel().astype(float)
            else:
                # all reference positions count as valid -> denominator = n_cols_for_ref
                valid_count = np.full(modified_count.shape, float(n_cols_for_ref))
        else:
            Xb = np.asarray(X_block)
            if Xb.size == 0:
                zeros = np.zeros(Xb.shape[0] if Xb.ndim else 0, dtype=float)
                return zeros, zeros.copy()
            if np.isnan(Xb).any():
                valid_count = np.sum(~np.isnan(Xb), axis=1).astype(float)
            elif treat_zero_as_invalid:
                valid_count = np.sum(Xb != 0, axis=1).astype(float)
            else:
                valid_count = np.full((Xb.shape[0],), float(n_cols_for_ref))
            modified_count = np.sum(Xb > 0, axis=1).astype(float)
        return modified_count, valid_count

    # If we need to compute obs summaries: do so per-reference in batches
    if not required_present and compute_obs_if_missing:
        n_obs = adata.n_obs
        # Create only the columns relevant to mod_target_bases; filled below,
        # rows with no data remain NaN.
        create_cols = {}
        if "GpC" in mod_target_bases:
            create_cols["Fraction_GpC_site_modified"] = np.full((n_obs,), np.nan)
            create_cols["Valid_GpC_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
            # optional background ratio, populated later if background data exists
            create_cols["GpC_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
        if "CpG" in mod_target_bases:
            create_cols["Fraction_CpG_site_modified"] = np.full((n_obs,), np.nan)
            create_cols["Valid_CpG_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
            create_cols["CpG_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
        if "C" in mod_target_bases:
            create_cols["Fraction_C_site_modified"] = np.full((n_obs,), np.nan)
            create_cols["Valid_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
        if "A" in mod_target_bases:
            create_cols["Fraction_A_site_modified"] = np.full((n_obs,), np.nan)
            create_cols["Valid_A_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)

        def _compute_for_ref_and_suffix(ref, suffix, out_frac_arr, out_valid_arr):
            """
            Compute fraction modified and valid fraction for reads mapping to 'ref'
            using the var column named f"{ref}_{suffix}" to select var columns.
            Writes results in place into out_frac_arr / out_valid_arr.
            """
            var_colname = _find_var_col_for(ref, suffix)
            if var_colname is None:
                return  # nothing to compute

            # var boolean mask (which var columns belong to this suffix for the ref)
            try:
                var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
            except Exception:
                # non-boolean var values: attempt numeric coercion
                var_mask_bool = np.asarray(
                    pd.to_numeric(adata.var[var_colname], errors='coerce').fillna(0).astype(bool)
                )

            col_indices = np.where(var_mask_bool)[0]
            n_cols_for_ref = len(col_indices)
            if n_cols_for_ref == 0:
                return

            # rows that belong to this reference
            row_indices_all = np.where(adata.obs[reference_column].values == ref)[0]
            if len(row_indices_all) == 0:
                return

            # process rows in batches to avoid allocating huge slices
            for start in range(0, len(row_indices_all), batch_size):
                block_rows_idx = row_indices_all[start:start + batch_size]
                X_block = adata.X[block_rows_idx, :][:, col_indices]

                modified_count, valid_count = _block_counts(X_block, n_cols_for_ref)

                # fraction modified = modified_count / valid_count (guard divide-by-zero)
                nonzero = valid_count > 0
                frac = np.zeros_like(modified_count, dtype=float)
                frac[nonzero] = modified_count[nonzero] / valid_count[nonzero]
                out_frac_arr[block_rows_idx] = frac

                # BUG FIX: the original used chained fancy indexing
                # (out_valid_arr[idx][mask] = ...), which writes into a temporary
                # copy and silently discards all valid fractions. Build the batch
                # result first, then assign once.
                valid_frac = np.zeros_like(valid_count, dtype=float)
                valid_frac[nonzero] = valid_count[nonzero] / float(n_cols_for_ref)
                out_valid_arr[block_rows_idx] = valid_frac

                # free block memory ASAP
                del X_block, modified_count, valid_count, frac, valid_frac
                gc.collect()

        # compute for each reference and each required suffix
        compute_plan = (
            ("GpC", "GpC_site"),
            ("CpG", "CpG_site"),
            ("C", "C_site"),
            ("A", "A_site"),
        )
        for base, suffix in compute_plan:
            if base in mod_target_bases:
                frac_col, valid_col = frac_valid_cols[base]
                for ref in refs:
                    _compute_for_ref_and_suffix(ref, suffix, create_cols[frac_col], create_cols[valid_col])

        # write created arrays into adata.obs
        for cname, arr in create_cols.items():
            adata.obs[cname] = arr

        # Optionally compute GpC/CpG-to-other-C background ratios. The ratio is
        # only computable when a per-read 'Fraction_other_C_site_modified' column
        # already exists in obs; otherwise the ratio column stays NaN.
        for base in ("GpC", "CpG"):
            if base in mod_target_bases and use_other_c_as_background:
                ratio_col = f"{base}_to_other_C_mod_ratio"
                if "Fraction_other_C_site_modified" in adata.obs.columns:
                    with np.errstate(divide='ignore', invalid='ignore'):
                        ratio = (
                            adata.obs[frac_valid_cols[base][0]].astype(float)
                            / adata.obs["Fraction_other_C_site_modified"].astype(float)
                        )
                    adata.obs[ratio_col] = ratio.fillna(0.0)
                else:
                    adata.obs[ratio_col] = np.nan

        # free memory
        del create_cols
        gc.collect()

    # --- Apply the filters using adata.obs columns (memory-friendly: chained views) ---
    filtered = adata  # we'll chain subset operations; a single copy is made at the end

    def _unpack_minmax(thr):
        """Return (lo, hi) floats from a [min, max] pair; swap if reversed, (None, None) on bad input."""
        if thr is None:
            return None, None
        try:
            lo = float(thr[0]) if thr[0] is not None else None
            hi = float(thr[1]) if thr[1] is not None else None
            if lo is not None and hi is not None and lo > hi:
                lo, hi = hi, lo
            return lo, hi
        except Exception:
            return None, None

    def _apply_band_filter(base, thresholds, label, use_background):
        """Apply the inclusive [min, max] fraction filter (plus optional background
        and valid-site checks) for one mod type, narrowing ``filtered`` in place."""
        nonlocal filtered
        if not (thresholds and base in mod_target_bases):
            return
        lo, hi = _unpack_minmax(thresholds)
        frac_col, valid_col = frac_valid_cols[base]
        ratio_col = f"{base}_to_other_C_mod_ratio"
        # Background check: only meaningful outside deaminase chemistry and when
        # the ratio column was actually computed.
        if use_background and smf_modality != 'deaminase' and ratio_col in filtered.obs.columns:
            filtered = filtered[filtered.obs[ratio_col].astype(float) > 1]
        if lo is not None:
            s0 = filtered.n_obs
            filtered = filtered[filtered.obs[frac_col].astype(float) >= lo]
            print(f"Removed {s0 - filtered.n_obs} reads below min {label} fraction {lo}")
        if hi is not None:
            s0 = filtered.n_obs
            filtered = filtered[filtered.obs[frac_col].astype(float) <= hi]
            print(f"Removed {s0 - filtered.n_obs} reads above max {label} fraction {hi}")
        if (min_valid_fraction_positions_in_read_vs_ref is not None) and (valid_col in filtered.obs.columns):
            s0 = filtered.n_obs
            filtered = filtered[
                filtered.obs[valid_col].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)
            ]
            print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid {label} site fraction vs ref")

    _apply_band_filter("GpC", gpc_thresholds, "GpC", use_other_c_as_background)
    _apply_band_filter("CpG", cpg_thresholds, "CpG", use_other_c_as_background)
    _apply_band_filter("C", any_c_thresholds, "any-C", False)
    _apply_band_filter("A", a_thresholds, "A", False)

    # materialize the chained views once
    filtered = filtered.copy()

    # mark as done
    filtered.uns[uns_flag] = True

    return filtered
|