smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,16 @@
- import math
+ from __future__ import annotations
+
  import gc
+ from typing import List, Optional, Sequence
+
+ import anndata as ad
  import numpy as np
  import pandas as pd
- import anndata as ad
- from typing import Optional, Sequence, List
+
+ from smftools.logging_utils import get_logger
+
+ logger = get_logger(__name__)
+

  def filter_reads_on_modification_thresholds(
  adata: ad.AnnData,
@@ -15,32 +22,40 @@ def filter_reads_on_modification_thresholds(
  a_thresholds: Optional[Sequence[float]] = None,
  use_other_c_as_background: bool = False,
  min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
- uns_flag: str = 'filter_reads_on_modification_thresholds_performed',
+ uns_flag: str = "filter_reads_on_modification_thresholds_performed",
  bypass: bool = False,
  force_redo: bool = False,
- reference_column: str = 'Reference_strand',
+ reference_column: str = "Reference_strand",
  # memory-control options:
  batch_size: int = 200,
  compute_obs_if_missing: bool = True,
- treat_zero_as_invalid: bool = False
+ treat_zero_as_invalid: bool = False,
  ) -> ad.AnnData:
- """
- Memory-efficient filtering by per-read modification thresholds.
-
- - If required obs columns exist, uses them directly (fast).
- - Otherwise, computes the relevant per-read metrics per-reference in batches
- and writes them into adata.obs before filtering.
-
- Parameters of interest :
- - gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
- each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
- - use_other_c_as_background: require GpC/CpG > other_C background (if present).
- - min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
- in the read vs reference (0..1). If None, this check is skipped.
- - compute_obs_if_missing: if True, compute Fraction_* and Valid_* obs columns
- if they are not already present, using a low-memory per-ref strategy.
- - treat_zero_as_invalid: if True, a zero in X counts as invalid (non-site).
- If False, zeros are considered valid positions (adjust to your data semantics).
+ """Filter reads based on per-read modification thresholds.
+
+ If required obs columns exist, they are used directly. Otherwise, the function
+ computes the relevant per-read metrics in batches and stores them in ``adata.obs``.
+
+ Args:
+ adata: AnnData object to filter.
+ smf_modality: SMF modality identifier.
+ mod_target_bases: List of target bases to evaluate.
+ gpc_thresholds: ``[min, max]`` thresholds for GpC (0..1) or ``None``.
+ cpg_thresholds: ``[min, max]`` thresholds for CpG (0..1) or ``None``.
+ any_c_thresholds: ``[min, max]`` thresholds for any C (0..1) or ``None``.
+ a_thresholds: ``[min, max]`` thresholds for A (0..1) or ``None``.
+ use_other_c_as_background: Require GpC/CpG > other_C background if present.
+ min_valid_fraction_positions_in_read_vs_ref: Minimum valid-site fraction per read.
+ uns_flag: Flag in ``adata.uns`` indicating prior completion.
+ bypass: Whether to skip processing.
+ force_redo: Whether to rerun even if ``uns_flag`` is set.
+ reference_column: Obs column containing reference identifiers.
+ batch_size: Batch size for low-memory computation.
+ compute_obs_if_missing: Whether to compute missing obs summaries.
+ treat_zero_as_invalid: Whether zeros should be treated as invalid positions.
+
+ Returns:
+ anndata.AnnData: Filtered AnnData object.
  """

  # quick exit flags:
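The reworked docstring above describes inclusive [min, max] threshold pairs per base context. A minimal usage sketch, assuming the function is re-exported from smftools.preprocessing as the file layout suggests; the input path, modality string, and threshold values are illustrative, not taken from the package documentation:

    import anndata as ad

    from smftools.preprocessing import filter_reads_on_modification_thresholds

    adata = ad.read_h5ad("experiment.h5ad")  # hypothetical input file

    # Keep reads whose GpC modification fraction falls in [0.05, 0.95] and whose
    # CpG fraction is at most 0.8; both bounds are inclusive per the docstring.
    filtered = filter_reads_on_modification_thresholds(
        adata,
        smf_modality="conversion",          # illustrative modality value
        mod_target_bases=["GpC", "CpG"],
        gpc_thresholds=[0.05, 0.95],
        cpg_thresholds=[None, 0.8],         # None leaves the lower bound open
        min_valid_fraction_positions_in_read_vs_ref=0.5,
    )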
@@ -50,17 +65,23 @@ def filter_reads_on_modification_thresholds(

  # helper: check whether obs columns exist for a particular mod type
  def obs_has_columns_for(mod_type):
+ """Return True if per-read summary columns exist for a mod type."""
  col_pref = {
- "GpC": ("Fraction_GpC_site_modified", f"Valid_GpC_site_in_read_vs_reference"),
- "CpG": ("Fraction_CpG_site_modified", f"Valid_CpG_site_in_read_vs_reference"),
- "C": ("Fraction_C_site_modified", f"Valid_C_site_in_read_vs_reference"),
- "A": ("Fraction_A_site_modified", f"Valid_A_site_in_read_vs_reference"),
+ "GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
+ "CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
+ "C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
+ "A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
  }.get(mod_type, (None, None))
  return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)

  # if all required obs columns are present, use them directly (fast path)
  required_present = True
- for mt, thr in (("GpC", gpc_thresholds), ("CpG", cpg_thresholds), ("C", any_c_thresholds), ("A", a_thresholds)):
+ for mt, thr in (
+ ("GpC", gpc_thresholds),
+ ("CpG", cpg_thresholds),
+ ("C", any_c_thresholds),
+ ("A", a_thresholds),
+ ):
  if thr is not None and mt in mod_target_bases:
  if not obs_has_columns_for(mt):
  required_present = False
@@ -75,9 +96,10 @@ def filter_reads_on_modification_thresholds(
  # Build mapping from reference -> var column names (expected pattern)
  # e.g. var column names: "{ref}_GpC_site", "{ref}_CpG_site", "{ref}_any_C_site", "{ref}_other_C_site", "{ref}_A_site"
  # If your var column naming differs, adjust these suffixes.
- refs = list(adata.obs[reference_column].astype('category').cat.categories)
+ refs = list(adata.obs[reference_column].astype("category").cat.categories)

  def _find_var_col_for(ref, suffix):
+ """Resolve a var column name for a reference/suffix pair."""
  name = f"{ref}_{suffix}"
  if name in adata.var.columns:
  return name
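The comments in the hunk above say per-reference site annotations are expected as boolean columns in adata.var named "{ref}_GpC_site", "{ref}_CpG_site", and so on. A toy construction showing that expected layout, with a hypothetical reference name and made-up site positions:

    import anndata as ad
    import numpy as np
    import pandas as pd

    # 4 reads x 6 reference positions; values are illustrative binary calls.
    X = np.random.randint(0, 2, size=(4, 6)).astype(float)

    # Boolean site masks follow the "{ref}_<suffix>" pattern described above.
    var = pd.DataFrame(
        {
            "chrI_top_GpC_site": [True, False, True, False, False, True],
            "chrI_top_CpG_site": [False, True, False, False, True, False],
        },
        index=[f"pos_{i}" for i in range(6)],
    )
    obs = pd.DataFrame(
        {"Reference_strand": ["chrI_top"] * 4},
        index=[f"read_{i}" for i in range(4)],
    )
    toy = ad.AnnData(X=X, obs=obs, var=var)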
@@ -121,7 +143,9 @@ def filter_reads_on_modification_thresholds(
  var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
  except Exception:
  # if var has values not boolean, attempt coercion
- var_mask_bool = np.asarray(pd.to_numeric(adata.var[var_colname], errors='coerce').fillna(0).astype(bool))
+ var_mask_bool = np.asarray(
+ pd.to_numeric(adata.var[var_colname], errors="coerce").fillna(0).astype(bool)
+ )

  if not var_mask_bool.any():
  return
@@ -154,16 +178,20 @@ def filter_reads_on_modification_thresholds(
  # valid_count = (non-nan if float data else non-zero) per row
  # For sparse, .data are only stored nonzeros, so (X_block > 0).sum is fine
  modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel()
- if np.isnan(X_block.data).any() if hasattr(X_block, 'data') else False:
+ if np.isnan(X_block.data).any() if hasattr(X_block, "data") else False:
  # if sparse with stored NaNs (!) handle differently - unlikely
- valid_count = np.asarray(~np.isnan(X_block.toarray()).sum(axis=1)).ravel()
+ valid_count = np.asarray(
+ ~np.isnan(X_block.toarray()).sum(axis=1)
+ ).ravel()
  else:
  if treat_zero_as_invalid:
  # valid = number of non-zero entries
  valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel()
  else:
  # treat all positions as valid positions (they exist in reference) -> denominator = n_cols_for_ref
- valid_count = np.full_like(modified_count, n_cols_for_ref, dtype=float)
+ valid_count = np.full_like(
+ modified_count, n_cols_for_ref, dtype=float
+ )
  else:
  # dense numpy
  Xb = np.asarray(X_block)
@@ -193,14 +221,18 @@ def filter_reads_on_modification_thresholds(

  # fraction modified = modified_count / valid_count (guard divide-by-zero)
  frac = np.zeros_like(modified_count, dtype=float)
- mask_valid_nonzero = (valid_count > 0)
- frac[mask_valid_nonzero] = modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
+ mask_valid_nonzero = valid_count > 0
+ frac[mask_valid_nonzero] = (
+ modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
+ )

  # write to out arrays
  out_frac_arr[block_rows_idx] = frac
  # valid fraction relative to reference = valid_count / n_cols_for_ref
  out_valid_arr[block_rows_idx] = np.zeros_like(valid_count, dtype=float)
- out_valid_arr[block_rows_idx][mask_valid_nonzero] = (valid_count[mask_valid_nonzero] / float(n_cols_for_ref))
+ out_valid_arr[block_rows_idx][mask_valid_nonzero] = valid_count[
+ mask_valid_nonzero
+ ] / float(n_cols_for_ref)

  # free block memory ASAP
  del X_block, modified_count, valid_count, frac
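The two hunks above compute, per reference batch, modified_count / valid_count and valid_count / n_cols_for_ref. A self-contained NumPy sketch of that arithmetic on a toy dense block, showing how treat_zero_as_invalid changes the denominator (toy values only, not package internals):

    import numpy as np

    # Toy dense block: 2 reads x 4 reference site positions (1 = modified call, 0 = unmodified).
    X_block = np.array([[1, 0, 1, 0],
                        [1, 1, 0, 0]], dtype=float)
    n_cols_for_ref = X_block.shape[1]

    modified_count = (X_block > 0).sum(axis=1)                 # [2, 2]

    # treat_zero_as_invalid=False: every reference position counts as valid.
    valid_all = np.full_like(modified_count, n_cols_for_ref, dtype=float)
    frac_all = modified_count / valid_all                      # [0.5, 0.5]

    # treat_zero_as_invalid=True: only non-zero entries count toward the denominator.
    valid_nonzero = (X_block != 0).sum(axis=1)                 # [2, 2]
    frac_nonzero = modified_count / np.maximum(valid_nonzero, 1)  # [1.0, 1.0] for this toy block

    # Valid-site fraction relative to the reference, used by the min_valid_... check.
    valid_fraction_vs_ref = valid_nonzero / float(n_cols_for_ref)  # [0.5, 0.5]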
@@ -210,29 +242,51 @@ def filter_reads_on_modification_thresholds(
  # GpC
  if "GpC" in mod_target_bases:
  for ref in refs:
- _compute_for_ref_and_suffix(ref, "GpC_site", create_cols["Fraction_GpC_site_modified"], create_cols["Valid_GpC_site_in_read_vs_reference"])
+ _compute_for_ref_and_suffix(
+ ref,
+ "GpC_site",
+ create_cols["Fraction_GpC_site_modified"],
+ create_cols["Valid_GpC_site_in_read_vs_reference"],
+ )
  # other_C (for background)
  # We'll also compute 'other_C' per reference if it exists
  other_c_per_ref = {}
  for ref in refs:
  other_col = _find_var_col_for(ref, "other_C_site")
  if other_col:
- other_c_per_ref[ref] = np.where(np.asarray(adata.var[other_col].values).astype(bool))[0]
+ other_c_per_ref[ref] = np.where(
+ np.asarray(adata.var[other_col].values).astype(bool)
+ )[0]

  # CpG
  if "CpG" in mod_target_bases:
  for ref in refs:
- _compute_for_ref_and_suffix(ref, "CpG_site", create_cols["Fraction_CpG_site_modified"], create_cols["Valid_CpG_site_in_read_vs_reference"])
+ _compute_for_ref_and_suffix(
+ ref,
+ "CpG_site",
+ create_cols["Fraction_CpG_site_modified"],
+ create_cols["Valid_CpG_site_in_read_vs_reference"],
+ )

  # any C
  if "C" in mod_target_bases:
  for ref in refs:
- _compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
+ _compute_for_ref_and_suffix(
+ ref,
+ "C_site",
+ create_cols["Fraction_C_site_modified"],
+ create_cols["Valid_C_site_in_read_vs_reference"],
+ )

  # A
  if "A" in mod_target_bases:
  for ref in refs:
- _compute_for_ref_and_suffix(ref, "A_site", create_cols["Fraction_A_site_modified"], create_cols["Valid_A_site_in_read_vs_reference"])
+ _compute_for_ref_and_suffix(
+ ref,
+ "A_site",
+ create_cols["Fraction_A_site_modified"],
+ create_cols["Valid_A_site_in_read_vs_reference"],
+ )

  # write created arrays into adata.obs
  for cname, arr in create_cols.items():
@@ -243,16 +297,20 @@ def filter_reads_on_modification_thresholds(
  # compute per-ref background ratio if both exist
  # Simplest approach: if 'Fraction_GpC_site_modified' and 'Fraction_other_C_site_modified' exist, compute ratio
  if "Fraction_other_C_site_modified" in adata.obs.columns:
- with np.errstate(divide='ignore', invalid='ignore'):
- ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
+ with np.errstate(divide="ignore", invalid="ignore"):
+ ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
+ "Fraction_other_C_site_modified"
+ ].astype(float)
  adata.obs["GpC_to_other_C_mod_ratio"] = ratio.fillna(0.0)
  else:
  adata.obs["GpC_to_other_C_mod_ratio"] = np.nan

  if "CpG" in mod_target_bases and use_other_c_as_background:
  if "Fraction_other_C_site_modified" in adata.obs.columns:
- with np.errstate(divide='ignore', invalid='ignore'):
- ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
+ with np.errstate(divide="ignore", invalid="ignore"):
+ ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
+ "Fraction_other_C_site_modified"
+ ].astype(float)
  adata.obs["CpG_to_other_C_mod_ratio"] = ratio.fillna(0.0)
  else:
  adata.obs["CpG_to_other_C_mod_ratio"] = np.nan
@@ -266,10 +324,14 @@ def filter_reads_on_modification_thresholds(

  # helper to get min/max from param like [min, max] or tuple(None,..)
  def _unpack_minmax(thr):
+ """Normalize a threshold pair to ordered (min, max) floats."""
  if thr is None:
  return None, None
  try:
- lo, hi = float(thr[0]) if thr[0] is not None else None, float(thr[1]) if thr[1] is not None else None
+ lo, hi = (
+ float(thr[0]) if thr[0] is not None else None,
+ float(thr[1]) if thr[1] is not None else None,
+ )
  if lo is not None and hi is not None and lo > hi:
  lo, hi = hi, lo
  return lo, hi
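The helper above accepts a [min, max] pair in which either element may be None, swaps reversed bounds, and falls back to (None, None) on malformed input. A standalone illustrative equivalent with example calls (not the package's own code):

    from typing import Optional, Sequence, Tuple

    def unpack_minmax(
        thr: Optional[Sequence[Optional[float]]],
    ) -> Tuple[Optional[float], Optional[float]]:
        """Illustrative stand-in for the _unpack_minmax helper shown above."""
        if thr is None:
            return None, None
        try:
            lo = float(thr[0]) if thr[0] is not None else None
            hi = float(thr[1]) if thr[1] is not None else None
            if lo is not None and hi is not None and lo > hi:
                lo, hi = hi, lo  # swap reversed bounds
            return lo, hi
        except Exception:
            return None, None

    assert unpack_minmax([0.9, 0.1]) == (0.1, 0.9)    # reversed pair is reordered
    assert unpack_minmax([None, 0.8]) == (None, 0.8)  # open lower bound
    assert unpack_minmax(None) == (None, None)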
@@ -277,76 +339,124 @@ def filter_reads_on_modification_thresholds(
  return None, None

  # GpC thresholds
- if gpc_thresholds and 'GpC' in mod_target_bases:
+ if gpc_thresholds and "GpC" in mod_target_bases:
  lo, hi = _unpack_minmax(gpc_thresholds)
- if use_other_c_as_background and smf_modality != 'deaminase' and "GpC_to_other_C_mod_ratio" in filtered.obs.columns:
+ if (
+ use_other_c_as_background
+ and smf_modality != "deaminase"
+ and "GpC_to_other_C_mod_ratio" in filtered.obs.columns
+ ):
  filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
  if lo is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
- print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
+ logger.info("Removed %s reads below min GpC fraction %s", s0 - filtered.n_obs, lo)
  if hi is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
- print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
+ logger.info("Removed %s reads above max GpC fraction %s", s0 - filtered.n_obs, hi)
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+ "Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns
+ ):
  s0 = filtered.n_obs
- filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
+ filtered = filtered[
+ filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
+ ]
+ logger.info(
+ "Removed %s reads with insufficient valid GpC site fraction vs ref",
+ s0 - filtered.n_obs,
+ )

  # CpG thresholds
- if cpg_thresholds and 'CpG' in mod_target_bases:
+ if cpg_thresholds and "CpG" in mod_target_bases:
  lo, hi = _unpack_minmax(cpg_thresholds)
- if use_other_c_as_background and smf_modality != 'deaminase' and "CpG_to_other_C_mod_ratio" in filtered.obs.columns:
+ if (
+ use_other_c_as_background
+ and smf_modality != "deaminase"
+ and "CpG_to_other_C_mod_ratio" in filtered.obs.columns
+ ):
  filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
  if lo is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
- print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
+ logger.info("Removed %s reads below min CpG fraction %s", s0 - filtered.n_obs, lo)
  if hi is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
- print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
+ logger.info("Removed %s reads above max CpG fraction %s", s0 - filtered.n_obs, hi)
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+ "Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns
+ ):
  s0 = filtered.n_obs
- filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
+ filtered = filtered[
+ filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
+ ]
+ logger.info(
+ "Removed %s reads with insufficient valid CpG site fraction vs ref",
+ s0 - filtered.n_obs,
+ )

  # any C thresholds
- if any_c_thresholds and 'C' in mod_target_bases:
+ if any_c_thresholds and "C" in mod_target_bases:
  lo, hi = _unpack_minmax(any_c_thresholds)
  if lo is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
- print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
+ logger.info(
+ "Removed %s reads below min any-C fraction %s",
+ s0 - filtered.n_obs,
+ lo,
+ )
  if hi is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
- print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
+ logger.info(
+ "Removed %s reads above max any-C fraction %s",
+ s0 - filtered.n_obs,
+ hi,
+ )
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+ "Valid_C_site_in_read_vs_reference" in filtered.obs.columns
+ ):
  s0 = filtered.n_obs
- filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
+ filtered = filtered[
+ filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float)
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
+ ]
+ logger.info(
+ "Removed %s reads with insufficient valid any-C site fraction vs ref",
+ s0 - filtered.n_obs,
+ )

  # A thresholds
- if a_thresholds and 'A' in mod_target_bases:
+ if a_thresholds and "A" in mod_target_bases:
  lo, hi = _unpack_minmax(a_thresholds)
  if lo is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
- print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
+ logger.info("Removed %s reads below min A fraction %s", s0 - filtered.n_obs, lo)
  if hi is not None:
  s0 = filtered.n_obs
  filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
- print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
+ logger.info("Removed %s reads above max A fraction %s", s0 - filtered.n_obs, hi)
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
+ "Valid_A_site_in_read_vs_reference" in filtered.obs.columns
+ ):
  s0 = filtered.n_obs
- filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
+ filtered = filtered[
+ filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
+ ]
+ logger.info(
+ "Removed %s reads with insufficient valid A site fraction vs ref",
+ s0 - filtered.n_obs,
+ )

  filtered = filtered.copy()

  # mark as done
  filtered.uns[uns_flag] = True

- return filtered
+ return filtered
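The per-step "Removed N reads ..." summaries that 0.2.4 printed are emitted through logger.info in 0.3.0. A minimal sketch for surfacing them in a script, assuming smftools.logging_utils.get_logger builds on the standard-library logging hierarchy (not verified here):

    import logging

    # Enable INFO-level output so the filtering summaries from this module are visible.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("smftools").setLevel(logging.INFO)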