smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,14 @@
1
- import math
2
1
  import gc
2
+ from typing import List, Optional, Sequence
3
+
4
+ import anndata as ad
3
5
  import numpy as np
4
6
  import pandas as pd
5
- import anndata as ad
6
- from typing import Optional, Sequence, List
7
+
8
+ from smftools.logging_utils import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
7
12
 
8
13
  def filter_reads_on_modification_thresholds(
9
14
  adata: ad.AnnData,
@@ -15,32 +20,40 @@ def filter_reads_on_modification_thresholds(
15
20
  a_thresholds: Optional[Sequence[float]] = None,
16
21
  use_other_c_as_background: bool = False,
17
22
  min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
18
- uns_flag: str = 'reads_filtered_on_modification_thresholds',
23
+ uns_flag: str = "filter_reads_on_modification_thresholds_performed",
19
24
  bypass: bool = False,
20
25
  force_redo: bool = False,
21
- reference_column: str = 'Reference_strand',
26
+ reference_column: str = "Reference_strand",
22
27
  # memory-control options:
23
28
  batch_size: int = 200,
24
29
  compute_obs_if_missing: bool = True,
25
- treat_zero_as_invalid: bool = False
30
+ treat_zero_as_invalid: bool = False,
26
31
  ) -> ad.AnnData:
27
- """
28
- Memory-efficient filtering by per-read modification thresholds.
29
-
30
- - If required obs columns exist, uses them directly (fast).
31
- - Otherwise, computes the relevant per-read metrics per-reference in batches
32
- and writes them into adata.obs before filtering.
33
-
34
- Parameters of interest :
35
- - gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
36
- each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
37
- - use_other_c_as_background: require GpC/CpG > other_C background (if present).
38
- - min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
39
- in the read vs reference (0..1). If None, this check is skipped.
40
- - compute_obs_if_missing: if True, compute Fraction_* and Valid_* obs columns
41
- if they are not already present, using a low-memory per-ref strategy.
42
- - treat_zero_as_invalid: if True, a zero in X counts as invalid (non-site).
43
- If False, zeros are considered valid positions (adjust to your data semantics).
32
+ """Filter reads based on per-read modification thresholds.
33
+
34
+ If required obs columns exist, they are used directly. Otherwise, the function
35
+ computes the relevant per-read metrics in batches and stores them in ``adata.obs``.
36
+
37
+ Args:
38
+ adata: AnnData object to filter.
39
+ smf_modality: SMF modality identifier.
40
+ mod_target_bases: List of target bases to evaluate.
41
+ gpc_thresholds: ``[min, max]`` thresholds for GpC (0..1) or ``None``.
42
+ cpg_thresholds: ``[min, max]`` thresholds for CpG (0..1) or ``None``.
43
+ any_c_thresholds: ``[min, max]`` thresholds for any C (0..1) or ``None``.
44
+ a_thresholds: ``[min, max]`` thresholds for A (0..1) or ``None``.
45
+ use_other_c_as_background: Require GpC/CpG > other_C background if present.
46
+ min_valid_fraction_positions_in_read_vs_ref: Minimum valid-site fraction per read.
47
+ uns_flag: Flag in ``adata.uns`` indicating prior completion.
48
+ bypass: Whether to skip processing.
49
+ force_redo: Whether to rerun even if ``uns_flag`` is set.
50
+ reference_column: Obs column containing reference identifiers.
51
+ batch_size: Batch size for low-memory computation.
52
+ compute_obs_if_missing: Whether to compute missing obs summaries.
53
+ treat_zero_as_invalid: Whether zeros should be treated as invalid positions.
54
+
55
+ Returns:
56
+ anndata.AnnData: Filtered AnnData object.
44
57
  """
45
58
 
46
59
  # quick exit flags:
@@ -50,17 +63,23 @@ def filter_reads_on_modification_thresholds(
50
63
 
51
64
  # helper: check whether obs columns exist for a particular mod type
52
65
  def obs_has_columns_for(mod_type):
66
+ """Return True if per-read summary columns exist for a mod type."""
53
67
  col_pref = {
54
- "GpC": ("Fraction_GpC_site_modified", f"Valid_GpC_site_in_read_vs_reference"),
55
- "CpG": ("Fraction_CpG_site_modified", f"Valid_CpG_site_in_read_vs_reference"),
56
- "C": ("Fraction_C_site_modified", f"Valid_C_site_in_read_vs_reference"),
57
- "A": ("Fraction_A_site_modified", f"Valid_A_site_in_read_vs_reference"),
68
+ "GpC": ("Fraction_GpC_site_modified", "Valid_GpC_site_in_read_vs_reference"),
69
+ "CpG": ("Fraction_CpG_site_modified", "Valid_CpG_site_in_read_vs_reference"),
70
+ "C": ("Fraction_C_site_modified", "Valid_C_site_in_read_vs_reference"),
71
+ "A": ("Fraction_A_site_modified", "Valid_A_site_in_read_vs_reference"),
58
72
  }.get(mod_type, (None, None))
59
73
  return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
60
74
 
61
75
  # if all required obs columns are present, use them directly (fast path)
62
76
  required_present = True
63
- for mt, thr in (("GpC", gpc_thresholds), ("CpG", cpg_thresholds), ("C", any_c_thresholds), ("A", a_thresholds)):
77
+ for mt, thr in (
78
+ ("GpC", gpc_thresholds),
79
+ ("CpG", cpg_thresholds),
80
+ ("C", any_c_thresholds),
81
+ ("A", a_thresholds),
82
+ ):
64
83
  if thr is not None and mt in mod_target_bases:
65
84
  if not obs_has_columns_for(mt):
66
85
  required_present = False
@@ -75,9 +94,10 @@ def filter_reads_on_modification_thresholds(
75
94
  # Build mapping from reference -> var column names (expected pattern)
76
95
  # e.g. var column names: "{ref}_GpC_site", "{ref}_CpG_site", "{ref}_any_C_site", "{ref}_other_C_site", "{ref}_A_site"
77
96
  # If your var column naming differs, adjust these suffixes.
78
- refs = list(adata.obs[reference_column].astype('category').cat.categories)
97
+ refs = list(adata.obs[reference_column].astype("category").cat.categories)
79
98
 
80
99
  def _find_var_col_for(ref, suffix):
100
+ """Resolve a var column name for a reference/suffix pair."""
81
101
  name = f"{ref}_{suffix}"
82
102
  if name in adata.var.columns:
83
103
  return name
@@ -121,7 +141,9 @@ def filter_reads_on_modification_thresholds(
121
141
  var_mask_bool = np.asarray(adata.var[var_colname].values).astype(bool)
122
142
  except Exception:
123
143
  # if var has values not boolean, attempt coercion
124
- var_mask_bool = np.asarray(pd.to_numeric(adata.var[var_colname], errors='coerce').fillna(0).astype(bool))
144
+ var_mask_bool = np.asarray(
145
+ pd.to_numeric(adata.var[var_colname], errors="coerce").fillna(0).astype(bool)
146
+ )
125
147
 
126
148
  if not var_mask_bool.any():
127
149
  return
@@ -154,16 +176,20 @@ def filter_reads_on_modification_thresholds(
154
176
  # valid_count = (non-nan if float data else non-zero) per row
155
177
  # For sparse, .data are only stored nonzeros, so (X_block > 0).sum is fine
156
178
  modified_count = np.asarray((X_block > 0).sum(axis=1)).ravel()
157
- if np.isnan(X_block.data).any() if hasattr(X_block, 'data') else False:
179
+ if np.isnan(X_block.data).any() if hasattr(X_block, "data") else False:
158
180
  # if sparse with stored NaNs (!) handle differently - unlikely
159
- valid_count = np.asarray(~np.isnan(X_block.toarray()).sum(axis=1)).ravel()
181
+ valid_count = np.asarray(
182
+ ~np.isnan(X_block.toarray()).sum(axis=1)
183
+ ).ravel()
160
184
  else:
161
185
  if treat_zero_as_invalid:
162
186
  # valid = number of non-zero entries
163
187
  valid_count = np.asarray((X_block != 0).sum(axis=1)).ravel()
164
188
  else:
165
189
  # treat all positions as valid positions (they exist in reference) -> denominator = n_cols_for_ref
166
- valid_count = np.full_like(modified_count, n_cols_for_ref, dtype=float)
190
+ valid_count = np.full_like(
191
+ modified_count, n_cols_for_ref, dtype=float
192
+ )
167
193
  else:
168
194
  # dense numpy
169
195
  Xb = np.asarray(X_block)
@@ -193,14 +219,18 @@ def filter_reads_on_modification_thresholds(
193
219
 
194
220
  # fraction modified = modified_count / valid_count (guard divide-by-zero)
195
221
  frac = np.zeros_like(modified_count, dtype=float)
196
- mask_valid_nonzero = (valid_count > 0)
197
- frac[mask_valid_nonzero] = modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
222
+ mask_valid_nonzero = valid_count > 0
223
+ frac[mask_valid_nonzero] = (
224
+ modified_count[mask_valid_nonzero] / valid_count[mask_valid_nonzero]
225
+ )
198
226
 
199
227
  # write to out arrays
200
228
  out_frac_arr[block_rows_idx] = frac
201
229
  # valid fraction relative to reference = valid_count / n_cols_for_ref
202
230
  out_valid_arr[block_rows_idx] = np.zeros_like(valid_count, dtype=float)
203
- out_valid_arr[block_rows_idx][mask_valid_nonzero] = (valid_count[mask_valid_nonzero] / float(n_cols_for_ref))
231
+ out_valid_arr[block_rows_idx][mask_valid_nonzero] = valid_count[
232
+ mask_valid_nonzero
233
+ ] / float(n_cols_for_ref)
204
234
 
205
235
  # free block memory ASAP
206
236
  del X_block, modified_count, valid_count, frac
@@ -210,29 +240,51 @@ def filter_reads_on_modification_thresholds(
210
240
  # GpC
211
241
  if "GpC" in mod_target_bases:
212
242
  for ref in refs:
213
- _compute_for_ref_and_suffix(ref, "GpC_site", create_cols["Fraction_GpC_site_modified"], create_cols["Valid_GpC_site_in_read_vs_reference"])
243
+ _compute_for_ref_and_suffix(
244
+ ref,
245
+ "GpC_site",
246
+ create_cols["Fraction_GpC_site_modified"],
247
+ create_cols["Valid_GpC_site_in_read_vs_reference"],
248
+ )
214
249
  # other_C (for background)
215
250
  # We'll also compute 'other_C' per reference if it exists
216
251
  other_c_per_ref = {}
217
252
  for ref in refs:
218
253
  other_col = _find_var_col_for(ref, "other_C_site")
219
254
  if other_col:
220
- other_c_per_ref[ref] = np.where(np.asarray(adata.var[other_col].values).astype(bool))[0]
255
+ other_c_per_ref[ref] = np.where(
256
+ np.asarray(adata.var[other_col].values).astype(bool)
257
+ )[0]
221
258
 
222
259
  # CpG
223
260
  if "CpG" in mod_target_bases:
224
261
  for ref in refs:
225
- _compute_for_ref_and_suffix(ref, "CpG_site", create_cols["Fraction_CpG_site_modified"], create_cols["Valid_CpG_site_in_read_vs_reference"])
262
+ _compute_for_ref_and_suffix(
263
+ ref,
264
+ "CpG_site",
265
+ create_cols["Fraction_CpG_site_modified"],
266
+ create_cols["Valid_CpG_site_in_read_vs_reference"],
267
+ )
226
268
 
227
269
  # any C
228
270
  if "C" in mod_target_bases:
229
271
  for ref in refs:
230
- _compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
272
+ _compute_for_ref_and_suffix(
273
+ ref,
274
+ "C_site",
275
+ create_cols["Fraction_C_site_modified"],
276
+ create_cols["Valid_C_site_in_read_vs_reference"],
277
+ )
231
278
 
232
279
  # A
233
280
  if "A" in mod_target_bases:
234
281
  for ref in refs:
235
- _compute_for_ref_and_suffix(ref, "A_site", create_cols["Fraction_A_site_modified"], create_cols["Valid_A_site_in_read_vs_reference"])
282
+ _compute_for_ref_and_suffix(
283
+ ref,
284
+ "A_site",
285
+ create_cols["Fraction_A_site_modified"],
286
+ create_cols["Valid_A_site_in_read_vs_reference"],
287
+ )
236
288
 
237
289
  # write created arrays into adata.obs
238
290
  for cname, arr in create_cols.items():
@@ -243,16 +295,20 @@ def filter_reads_on_modification_thresholds(
243
295
  # compute per-ref background ratio if both exist
244
296
  # Simplest approach: if 'Fraction_GpC_site_modified' and 'Fraction_other_C_site_modified' exist, compute ratio
245
297
  if "Fraction_other_C_site_modified" in adata.obs.columns:
246
- with np.errstate(divide='ignore', invalid='ignore'):
247
- ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
298
+ with np.errstate(divide="ignore", invalid="ignore"):
299
+ ratio = adata.obs["Fraction_GpC_site_modified"].astype(float) / adata.obs[
300
+ "Fraction_other_C_site_modified"
301
+ ].astype(float)
248
302
  adata.obs["GpC_to_other_C_mod_ratio"] = ratio.fillna(0.0)
249
303
  else:
250
304
  adata.obs["GpC_to_other_C_mod_ratio"] = np.nan
251
305
 
252
306
  if "CpG" in mod_target_bases and use_other_c_as_background:
253
307
  if "Fraction_other_C_site_modified" in adata.obs.columns:
254
- with np.errstate(divide='ignore', invalid='ignore'):
255
- ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs["Fraction_other_C_site_modified"].astype(float)
308
+ with np.errstate(divide="ignore", invalid="ignore"):
309
+ ratio = adata.obs["Fraction_CpG_site_modified"].astype(float) / adata.obs[
310
+ "Fraction_other_C_site_modified"
311
+ ].astype(float)
256
312
  adata.obs["CpG_to_other_C_mod_ratio"] = ratio.fillna(0.0)
257
313
  else:
258
314
  adata.obs["CpG_to_other_C_mod_ratio"] = np.nan
@@ -266,10 +322,14 @@ def filter_reads_on_modification_thresholds(
266
322
 
267
323
  # helper to get min/max from param like [min, max] or tuple(None,..)
268
324
  def _unpack_minmax(thr):
325
+ """Normalize a threshold pair to ordered (min, max) floats."""
269
326
  if thr is None:
270
327
  return None, None
271
328
  try:
272
- lo, hi = float(thr[0]) if thr[0] is not None else None, float(thr[1]) if thr[1] is not None else None
329
+ lo, hi = (
330
+ float(thr[0]) if thr[0] is not None else None,
331
+ float(thr[1]) if thr[1] is not None else None,
332
+ )
273
333
  if lo is not None and hi is not None and lo > hi:
274
334
  lo, hi = hi, lo
275
335
  return lo, hi
@@ -277,76 +337,124 @@ def filter_reads_on_modification_thresholds(
277
337
  return None, None
278
338
 
279
339
  # GpC thresholds
280
- if gpc_thresholds and 'GpC' in mod_target_bases:
340
+ if gpc_thresholds and "GpC" in mod_target_bases:
281
341
  lo, hi = _unpack_minmax(gpc_thresholds)
282
- if use_other_c_as_background and smf_modality != 'deaminase' and "GpC_to_other_C_mod_ratio" in filtered.obs.columns:
342
+ if (
343
+ use_other_c_as_background
344
+ and smf_modality != "deaminase"
345
+ and "GpC_to_other_C_mod_ratio" in filtered.obs.columns
346
+ ):
283
347
  filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
284
348
  if lo is not None:
285
349
  s0 = filtered.n_obs
286
350
  filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
287
- print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
351
+ logger.info("Removed %s reads below min GpC fraction %s", s0 - filtered.n_obs, lo)
288
352
  if hi is not None:
289
353
  s0 = filtered.n_obs
290
354
  filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
291
- print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
292
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
355
+ logger.info("Removed %s reads above max GpC fraction %s", s0 - filtered.n_obs, hi)
356
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
357
+ "Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns
358
+ ):
293
359
  s0 = filtered.n_obs
294
- filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
295
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
360
+ filtered = filtered[
361
+ filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
362
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
363
+ ]
364
+ logger.info(
365
+ "Removed %s reads with insufficient valid GpC site fraction vs ref",
366
+ s0 - filtered.n_obs,
367
+ )
296
368
 
297
369
  # CpG thresholds
298
- if cpg_thresholds and 'CpG' in mod_target_bases:
370
+ if cpg_thresholds and "CpG" in mod_target_bases:
299
371
  lo, hi = _unpack_minmax(cpg_thresholds)
300
- if use_other_c_as_background and smf_modality != 'deaminase' and "CpG_to_other_C_mod_ratio" in filtered.obs.columns:
372
+ if (
373
+ use_other_c_as_background
374
+ and smf_modality != "deaminase"
375
+ and "CpG_to_other_C_mod_ratio" in filtered.obs.columns
376
+ ):
301
377
  filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
302
378
  if lo is not None:
303
379
  s0 = filtered.n_obs
304
380
  filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
305
- print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
381
+ logger.info("Removed %s reads below min CpG fraction %s", s0 - filtered.n_obs, lo)
306
382
  if hi is not None:
307
383
  s0 = filtered.n_obs
308
384
  filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
309
- print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
310
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
385
+ logger.info("Removed %s reads above max CpG fraction %s", s0 - filtered.n_obs, hi)
386
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
387
+ "Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns
388
+ ):
311
389
  s0 = filtered.n_obs
312
- filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
313
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
390
+ filtered = filtered[
391
+ filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
392
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
393
+ ]
394
+ logger.info(
395
+ "Removed %s reads with insufficient valid CpG site fraction vs ref",
396
+ s0 - filtered.n_obs,
397
+ )
314
398
 
315
399
  # any C thresholds
316
- if any_c_thresholds and 'C' in mod_target_bases:
400
+ if any_c_thresholds and "C" in mod_target_bases:
317
401
  lo, hi = _unpack_minmax(any_c_thresholds)
318
402
  if lo is not None:
319
403
  s0 = filtered.n_obs
320
404
  filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
321
- print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
405
+ logger.info(
406
+ "Removed %s reads below min any-C fraction %s",
407
+ s0 - filtered.n_obs,
408
+ lo,
409
+ )
322
410
  if hi is not None:
323
411
  s0 = filtered.n_obs
324
412
  filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
325
- print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
326
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
413
+ logger.info(
414
+ "Removed %s reads above max any-C fraction %s",
415
+ s0 - filtered.n_obs,
416
+ hi,
417
+ )
418
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
419
+ "Valid_C_site_in_read_vs_reference" in filtered.obs.columns
420
+ ):
327
421
  s0 = filtered.n_obs
328
- filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
329
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
422
+ filtered = filtered[
423
+ filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float)
424
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
425
+ ]
426
+ logger.info(
427
+ "Removed %s reads with insufficient valid any-C site fraction vs ref",
428
+ s0 - filtered.n_obs,
429
+ )
330
430
 
331
431
  # A thresholds
332
- if a_thresholds and 'A' in mod_target_bases:
432
+ if a_thresholds and "A" in mod_target_bases:
333
433
  lo, hi = _unpack_minmax(a_thresholds)
334
434
  if lo is not None:
335
435
  s0 = filtered.n_obs
336
436
  filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
337
- print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
437
+ logger.info("Removed %s reads below min A fraction %s", s0 - filtered.n_obs, lo)
338
438
  if hi is not None:
339
439
  s0 = filtered.n_obs
340
440
  filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
341
- print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
342
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
441
+ logger.info("Removed %s reads above max A fraction %s", s0 - filtered.n_obs, hi)
442
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and (
443
+ "Valid_A_site_in_read_vs_reference" in filtered.obs.columns
444
+ ):
343
445
  s0 = filtered.n_obs
344
- filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
345
- print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
446
+ filtered = filtered[
447
+ filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
448
+ >= float(min_valid_fraction_positions_in_read_vs_ref)
449
+ ]
450
+ logger.info(
451
+ "Removed %s reads with insufficient valid A site fraction vs ref",
452
+ s0 - filtered.n_obs,
453
+ )
346
454
 
347
455
  filtered = filtered.copy()
348
456
 
349
457
  # mark as done
350
458
  filtered.uns[uns_flag] = True
351
459
 
352
- return filtered
460
+ return filtered