smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/filter_reads_on_modification_thresholds.py
@@ -0,0 +1,352 @@
+ import math
+ import gc
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ from typing import Optional, Sequence, List
+
+ def filter_reads_on_modification_thresholds(
+     adata: ad.AnnData,
+     smf_modality: str,
+     mod_target_bases: List[str] = [],
+     gpc_thresholds: Optional[Sequence[float]] = None,
+     cpg_thresholds: Optional[Sequence[float]] = None,
+     any_c_thresholds: Optional[Sequence[float]] = None,
+     a_thresholds: Optional[Sequence[float]] = None,
+     use_other_c_as_background: bool = False,
+     min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
+     uns_flag: str = 'reads_filtered_on_modification_thresholds',
+     bypass: bool = False,
+     force_redo: bool = False,
+     reference_column: str = 'Reference_strand',
+     # memory-control options:
+     batch_size: int = 200,
+     compute_obs_if_missing: bool = True,
+     treat_zero_as_invalid: bool = False
+ ) -> ad.AnnData:
+     """
+     Memory-efficient filtering by per-read modification thresholds.
+
+     - If the required obs columns exist, uses them directly (fast path).
+     - Otherwise, computes the relevant per-read metrics per reference in batches
+       and writes them into adata.obs before filtering.
+
+     Parameters of interest:
+     - gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
+       each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
+     - use_other_c_as_background: require GpC/CpG > other_C background (if present).
+     - min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
+       in the read vs reference (0..1). If None, this check is skipped.
+     - compute_obs_if_missing: if True, compute Fraction_* and Valid_* obs columns
+       if they are not already present, using a low-memory per-reference strategy.
+     - treat_zero_as_invalid: if True, a zero in X counts as invalid (non-site).
+       If False, zeros are considered valid positions (adjust to your data semantics).
+     """
+
+     # quick exit flags:
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass:
+         return adata
+
+     # helper: check whether obs columns exist for a particular mod type
+     def obs_has_columns_for(mod_type):
+         col_pref = {
+             "GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
+             "CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
+             "C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
+             "A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
+         }.get(mod_type, (None, None))
+         return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
+
+     # if all required obs columns are present, use them directly (fast path)
+     required_present = True
+     for mt, thr in (("GpC", gpc_thresholds), ("CpG", cpg_thresholds), ("C", any_c_thresholds), ("A", a_thresholds)):
+         if thr is not None and mt in mod_target_bases:
+             if not obs_has_columns_for(mt):
+                 required_present = False
+                 break
+
+     # if required obs columns are not present and compute_obs_if_missing is False => error
+     if not required_present and not compute_obs_if_missing:
+         raise RuntimeError(
+             "Required per-read summary columns not found in adata.obs and compute_obs_if_missing is False."
+         )
+
+     # Expected var column naming pattern per reference:
+     # "{ref}_GpC_site", "{ref}_CpG_site", "{ref}_any_C_site", "{ref}_other_C_site", "{ref}_A_site"
+     # If your var column naming differs, adjust these suffixes.
+     refs = list(adata.obs[reference_column].astype('category').cat.categories)
+
+     def _find_var_col_for(ref, suffix):
+         name = f"{ref}_{suffix}"
+         if name in adata.var.columns:
+             return name
+         return None
+
+     # If we need to compute obs summaries: do so per reference, in batches
+     if not required_present and compute_obs_if_missing:
+         n_obs = adata.n_obs
+         # prepare empty columns in obs if they don't exist; fill later.
+         # Only columns relevant to mod_target_bases are created.
+         create_cols = {}
+         if "GpC" in mod_target_bases:
+             create_cols["Fraction_GpC_site_modified"] = np.full((n_obs,), np.nan)
+             create_cols["Valid_GpC_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
+             # optional background ratio if other_C exists
+             create_cols["GpC_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
+         if "CpG" in mod_target_bases:
+             create_cols["Fraction_CpG_site_modified"] = np.full((n_obs,), np.nan)
+             create_cols["Valid_CpG_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
+             create_cols["CpG_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
+         if "C" in mod_target_bases:
+             create_cols["Fraction_C_site_modified"] = np.full((n_obs,), np.nan)
+             create_cols["Valid_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
+         if "A" in mod_target_bases:
+             create_cols["Fraction_A_site_modified"] = np.full((n_obs,), np.nan)
+             create_cols["Valid_A_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
+
+         # helper to compute for one reference and one suffix
+         def _compute_for_ref_and_suffix(ref, suffix, out_frac_arr, out_valid_arr):
+             """
+             Compute fraction modified and valid fraction for reads mapping to 'ref',
+             using the var column named f"{ref}_{suffix}" to select var columns.
+             """
+             var_colname = _find_var_col_for(ref, suffix)
+             if var_colname is None:
+                 # nothing to compute
+                 return
+
+             # var boolean mask (which var columns belong to this suffix for the ref)
+             try:
+                 var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
+             except Exception:
+                 # if var values are not boolean, attempt coercion
+                 var_mask_bool = np.asarray(pd.to_numeric(adata.var[var_colname], errors='coerce').fillna(0).astype(bool))
+
+             if not var_mask_bool.any():
+                 return
+             col_indices = np.where(var_mask_bool)[0]
+             n_cols_for_ref = len(col_indices)
+             if n_cols_for_ref == 0:
+                 return
+
+             # rows that belong to this reference
+             row_indices_all = np.where(adata.obs[reference_column].values == ref)[0]
+             if len(row_indices_all) == 0:
+                 return
+
+             # process rows for this reference in batches to avoid allocating huge slices
+             for start in range(0, len(row_indices_all), batch_size):
+                 block_rows_idx = row_indices_all[start : start + batch_size]
+                 # slice rows x selected columns
+                 X_block = adata.X[block_rows_idx, :][:, col_indices]
+
+                 # If sparse, sum(axis=1) returns an (nrows, 1) matrix-like object -> coerce to 1d array.
+                 # If dense, this is a dense array limited to batch_size * n_cols_for_ref.
+                 # Count modified entries (numeric values > 0 indicate modification).
+                 try:
+                     # use vectorized sums; works for sparse and dense
+                     if hasattr(X_block, "toarray") and not isinstance(X_block, np.ndarray):
+                         # sparse or matrix-like:
+                         #   modified_count = number of entries > 0 per row
+                         #   valid_count = non-NaN count (float data) or non-zero count per row
+                         # For sparse, only stored nonzeros participate, so (X_block > 0).sum is fine.
+                         modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel().astype(float)
+                         if hasattr(X_block, 'data') and np.isnan(X_block.data).any():
+                             # sparse matrix with explicitly stored NaNs (unlikely): densify and count non-NaN entries
+                             valid_count = np.asarray((~np.isnan(X_block.toarray())).sum(axis=1)).ravel().astype(float)
+                         else:
+                             if treat_zero_as_invalid:
+                                 # valid = number of non-zero entries
+                                 valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel().astype(float)
+                             else:
+                                 # all positions exist in the reference -> denominator = n_cols_for_ref
+                                 valid_count = np.full_like(modified_count, n_cols_for_ref, dtype=float)
+                     else:
+                         # dense numpy
+                         Xb = np.asarray(X_block)
+                         if np.isnan(Xb).any():
+                             valid_count = np.sum(~np.isnan(Xb), axis=1).astype(float)
+                         else:
+                             if treat_zero_as_invalid:
+                                 valid_count = np.sum(Xb != 0, axis=1).astype(float)
+                             else:
+                                 valid_count = np.full((Xb.shape[0],), float(n_cols_for_ref))
+                         modified_count = np.sum(Xb > 0, axis=1).astype(float)
+                 except Exception:
+                     # fallback to safe dense conversion (shouldn't normally be needed)
+                     Xb = np.asarray(X_block.toarray() if hasattr(X_block, "toarray") else X_block)
+                     if Xb.size == 0:
+                         modified_count = np.zeros(len(block_rows_idx), dtype=float)
+                         valid_count = np.zeros(len(block_rows_idx), dtype=float)
+                     else:
+                         if np.isnan(Xb).any():
+                             valid_count = np.sum(~np.isnan(Xb), axis=1).astype(float)
+                         else:
+                             if treat_zero_as_invalid:
+                                 valid_count = np.sum(Xb != 0, axis=1).astype(float)
+                             else:
+                                 valid_count = np.full((Xb.shape[0],), float(n_cols_for_ref))
+                         modified_count = np.sum(Xb > 0, axis=1).astype(float)
+
+                 # fraction modified = modified_count / valid_count (guard divide-by-zero)
+                 frac = np.zeros_like(modified_count, dtype=float)
+                 mask_valid_nonzero = (valid_count > 0)
+                 frac[mask_valid_nonzero] = modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
+
+                 # write to out arrays
+                 out_frac_arr[block_rows_idx] = frac
+                 # valid fraction relative to reference = valid_count / n_cols_for_ref;
+                 # build the per-block array first, then assign once (avoids chained fancy indexing)
+                 valid_frac = np.zeros_like(valid_count, dtype=float)
+                 valid_frac[mask_valid_nonzero] = valid_count[mask_valid_nonzero] / float(n_cols_for_ref)
+                 out_valid_arr[block_rows_idx] = valid_frac
+
+                 # free block memory ASAP
+                 del X_block, modified_count, valid_count, frac, valid_frac
+                 gc.collect()
+
+         # compute for each reference and the required suffixes
+         # GpC
+         if "GpC" in mod_target_bases:
+             for ref in refs:
+                 _compute_for_ref_and_suffix(ref, "GpC_site", create_cols["Fraction_GpC_site_modified"], create_cols["Valid_GpC_site_in_read_vs_reference"])
+         # other_C (for background): collect per-reference column indices if the mask exists
+         other_c_per_ref = {}
+         for ref in refs:
+             other_col = _find_var_col_for(ref, "other_C_site")
+             if other_col:
+                 other_c_per_ref[ref] = np.where(np.asarray(adata.var[other_col].values).astype(bool))[0]
+
+         # CpG
+         if "CpG" in mod_target_bases:
+             for ref in refs:
+                 _compute_for_ref_and_suffix(ref, "CpG_site", create_cols["Fraction_CpG_site_modified"], create_cols["Valid_CpG_site_in_read_vs_reference"])
+
+         # any C
+         if "C" in mod_target_bases:
+             for ref in refs:
+                 _compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
+
+         # A
+         if "A" in mod_target_bases:
+             for ref in refs:
+                 _compute_for_ref_and_suffix(ref, "A_site", create_cols["Fraction_A_site_modified"], create_cols["Valid_A_site_in_read_vs_reference"])
+
+         # write created arrays into adata.obs
+         for cname, arr in create_cols.items():
+             adata.obs[cname] = arr
+
+         # optionally compute GpC_to_other_C_mod_ratio and CpG_to_other_C_mod_ratio (if other_C summaries exist)
+         if "GpC" in mod_target_bases and use_other_c_as_background:
+             # simplest approach: if 'Fraction_GpC_site_modified' and 'Fraction_other_C_site_modified' exist, compute their ratio
+             if "Fraction_other_C_site_modified" in adata.obs.columns:
+                 with np.errstate(divide='ignore', invalid='ignore'):
+                     ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
+                 adata.obs["GpC_to_other_C_mod_ratio"] = ratio.fillna(0.0)
+             else:
+                 adata.obs["GpC_to_other_C_mod_ratio"] = np.nan
+
+         if "CpG" in mod_target_bases and use_other_c_as_background:
+             if "Fraction_other_C_site_modified" in adata.obs.columns:
+                 with np.errstate(divide='ignore', invalid='ignore'):
+                     ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
+                 adata.obs["CpG_to_other_C_mod_ratio"] = ratio.fillna(0.0)
+             else:
+                 adata.obs["CpG_to_other_C_mod_ratio"] = np.nan
+
+         # free memory
+         del create_cols
+         gc.collect()
+
+     # --- Apply the filters using the adata.obs columns ---
+     filtered = adata  # we'll chain subset operations
+
+     # helper to get min/max from a parameter like [min, max] (either bound may be None)
+     def _unpack_minmax(thr):
+         if thr is None:
+             return None, None
+         try:
+             lo = float(thr[0]) if thr[0] is not None else None
+             hi = float(thr[1]) if thr[1] is not None else None
+             if lo is not None and hi is not None and lo > hi:
+                 lo, hi = hi, lo
+             return lo, hi
+         except Exception:
+             return None, None
+
+     # GpC thresholds
+     if gpc_thresholds and 'GpC' in mod_target_bases:
+         lo, hi = _unpack_minmax(gpc_thresholds)
+         if use_other_c_as_background and smf_modality != 'deaminase' and "GpC_to_other_C_mod_ratio" in filtered.obs.columns:
+             filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
+         if lo is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
+             print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
+         if hi is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
+             print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
+         if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
+             print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
+
+     # CpG thresholds
+     if cpg_thresholds and 'CpG' in mod_target_bases:
+         lo, hi = _unpack_minmax(cpg_thresholds)
+         if use_other_c_as_background and smf_modality != 'deaminase' and "CpG_to_other_C_mod_ratio" in filtered.obs.columns:
+             filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
+         if lo is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
+             print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
+         if hi is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
+             print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
+         if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
+             print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
+
+     # any-C thresholds
+     if any_c_thresholds and 'C' in mod_target_bases:
+         lo, hi = _unpack_minmax(any_c_thresholds)
+         if lo is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
+             print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
+         if hi is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
+             print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
+         if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
+             print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
+
+     # A thresholds
+     if a_thresholds and 'A' in mod_target_bases:
+         lo, hi = _unpack_minmax(a_thresholds)
+         if lo is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
+             print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
+         if hi is not None:
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
+             print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
+         if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
+             s0 = filtered.n_obs
+             filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
+             print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
+
+     # materialize the chained views into a concrete AnnData
+     filtered = filtered.copy()
+
+     # mark as done
+     filtered.uns[uns_flag] = True
+
+     return filtered
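
For context, a minimal usage sketch of the new filter (not part of the diff). It assumes an AnnData object whose .obs carries a Reference_strand column and whose .var carries the boolean {ref}_GpC_site / {ref}_CpG_site masks the function looks for; the module import path follows the file listing above (the function may also be re-exported from smftools.preprocessing), and the threshold values, input file name, and smf_modality string are illustrative assumptions, not recommended defaults.

import anndata as ad
from smftools.preprocessing.filter_reads_on_modification_thresholds import filter_reads_on_modification_thresholds

adata = ad.read_h5ad("experiment.h5ad")  # hypothetical input produced by earlier load/preprocess steps

# Keep reads whose GpC-site modification fraction lies in [0.05, 0.95] and whose
# CpG-site fraction lies in [0.0, 0.9]; additionally require that at least 80% of
# the reference GpC/CpG sites are observed in each read. Thresholds are inclusive.
filtered = filter_reads_on_modification_thresholds(
    adata,
    smf_modality="conversion",   # assumed modality label; 'deaminase' disables the other-C background check
    mod_target_bases=["GpC", "CpG"],
    gpc_thresholds=[0.05, 0.95],
    cpg_thresholds=[0.0, 0.9],
    min_valid_fraction_positions_in_read_vs_ref=0.8,
    batch_size=500,              # larger batches trade memory for fewer passes over adata.X
)

print(filtered.n_obs, "reads retained")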