smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,4 @@
1
- def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.05):
1
+ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.00001, uns_flag='positional_coverage_calculated'):
2
2
  """
3
3
  Append position-level metadata regarding whether the position is informative within the given observation category.
4
4
 
@@ -13,6 +13,12 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
13
13
  import numpy as np
14
14
  import pandas as pd
15
15
  import anndata as ad
16
+
17
+ # Only run if not already performed
18
+ already = bool(adata.uns.get(uns_flag, False))
19
+ if already:
20
+ # QC already performed; nothing to do
21
+ return
16
22
 
17
23
  categories = adata.obs[obs_column].cat.categories
18
24
  n_categories_with_position = np.zeros(adata.shape[1])
@@ -40,3 +46,6 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
40
46
 
41
47
  # Store final category count
42
48
  adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
49
+
50
+ # mark as done
51
+ adata.uns[uns_flag] = True
@@ -103,7 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
103
103
  probability_thresholding_list[position] = (0.8, np.nan)
104
104
  title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
105
105
  plt.title(title)
106
- save_name = output_directory + f'/{title}'
106
+ save_name = output_directory / f"{title}.png"
107
107
  if save:
108
108
  plt.savefig(save_name)
109
109
  plt.close()
@@ -0,0 +1,101 @@
1
+ def calculate_read_modification_stats(adata,
2
+ reference_column,
3
+ sample_names_col,
4
+ mod_target_bases,
5
+ uns_flag="read_modification_stats_calculated",
6
+ bypass=False,
7
+ force_redo=False
8
+ ):
9
+ """
10
+ Adds methylation/deamination statistics for each read.
11
+ Indicates the read GpC and CpG methylation ratio to other_C methylation (background false positive metric for Cytosine MTase SMF).
12
+
13
+ Parameters:
14
+ adata (AnnData): An adata object
15
+ reference_column (str): String representing the name of the Reference column to use
16
+ sample_names_col (str): String representing the name of the sample name column to use
17
+ mod_target_bases:
18
+
19
+ Returns:
20
+ None
21
+ """
22
+ import numpy as np
23
+ import anndata as ad
24
+ import pandas as pd
25
+
26
+ # Only run if not already performed
27
+ already = bool(adata.uns.get(uns_flag, False))
28
+ if (already and not force_redo) or bypass:
29
+ # QC already performed; nothing to do
30
+ return
31
+
32
+ print('Calculating read level Modification statistics')
33
+
34
+ references = set(adata.obs[reference_column])
35
+ sample_names = set(adata.obs[sample_names_col])
36
+ site_types = []
37
+
38
+ if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
39
+ site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
40
+
41
+ if 'A' in mod_target_bases:
42
+ site_types += ['A_site']
43
+
44
+ for site_type in site_types:
45
+ adata.obs[f'Modified_{site_type}_count'] = pd.Series(0, index=adata.obs_names, dtype=int)
46
+ adata.obs[f'Total_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
47
+ adata.obs[f'Fraction_{site_type}_modified'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
48
+ adata.obs[f'Total_{site_type}_in_reference'] = pd.Series(np.nan, index=adata.obs_names, dtype=int)
49
+ adata.obs[f'Valid_{site_type}_in_read_vs_reference'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
50
+
51
+
52
+ for ref in references:
53
+ ref_subset = adata[adata.obs[reference_column] == ref]
54
+ for site_type in site_types:
55
+ print(f'Iterating over {ref}_{site_type}')
56
+ observation_matrix = ref_subset.obsm[f'{ref}_{site_type}']
57
+ total_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
58
+ total_positions_in_reference = observation_matrix.shape[1]
59
+ fraction_valid_positions_in_read_vs_ref = total_positions_in_read / total_positions_in_reference
60
+ number_mods_in_read = np.nansum(observation_matrix, axis=1)
61
+ fraction_modified = number_mods_in_read / total_positions_in_read
62
+
63
+ fraction_modified = np.divide(
64
+ number_mods_in_read,
65
+ total_positions_in_read,
66
+ out=np.full_like(number_mods_in_read, np.nan, dtype=float),
67
+ where=total_positions_in_read != 0
68
+ )
69
+
70
+ temp_obs_data = pd.DataFrame({f'Total_{site_type}_in_read': total_positions_in_read,
71
+ f'Modified_{site_type}_count': number_mods_in_read,
72
+ f'Fraction_{site_type}_modified': fraction_modified,
73
+ f'Total_{site_type}_in_reference': total_positions_in_reference,
74
+ f'Valid_{site_type}_in_read_vs_reference': fraction_valid_positions_in_read_vs_ref},
75
+ index=ref_subset.obs.index)
76
+
77
+ adata.obs.update(temp_obs_data)
78
+
79
+ if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
80
+ with np.errstate(divide='ignore', invalid='ignore'):
81
+ gpc_to_c_ratio = np.divide(
82
+ adata.obs[f'Fraction_GpC_site_modified'],
83
+ adata.obs[f'Fraction_other_C_site_modified'],
84
+ out=np.full_like(adata.obs[f'Fraction_GpC_site_modified'], np.nan, dtype=float),
85
+ where=adata.obs[f'Fraction_other_C_site_modified'] != 0
86
+ )
87
+
88
+ cpg_to_c_ratio = np.divide(
89
+ adata.obs[f'Fraction_CpG_site_modified'],
90
+ adata.obs[f'Fraction_other_C_site_modified'],
91
+ out=np.full_like(adata.obs[f'Fraction_CpG_site_modified'], np.nan, dtype=float),
92
+ where=adata.obs[f'Fraction_other_C_site_modified'] != 0
93
+ )
94
+
95
+ adata.obs['GpC_to_other_C_mod_ratio'] = gpc_to_c_ratio
96
+ adata.obs['CpG_to_other_C_mod_ratio'] = cpg_to_c_ratio
97
+
98
+ # mark as done
99
+ adata.uns[uns_flag] = True
100
+
101
+ return
@@ -1,4 +1,9 @@
1
- def clean_NaN(adata, layer=None):
1
+ def clean_NaN(adata,
2
+ layer=None,
3
+ uns_flag='clean_NaN_performed',
4
+ bypass=False,
5
+ force_redo=True
6
+ ):
2
7
  """
3
8
  Append layers to adata that contain NaN cleaning strategies.
4
9
 
@@ -14,6 +19,12 @@ def clean_NaN(adata, layer=None):
14
19
  import anndata as ad
15
20
  from ..readwrite import adata_to_df
16
21
 
22
+ # Only run if not already performed
23
+ already = bool(adata.uns.get(uns_flag, False))
24
+ if (already and not force_redo) or bypass:
25
+ # QC already performed; nothing to do
26
+ return
27
+
17
28
  # Ensure the specified layer exists
18
29
  if layer and layer not in adata.layers:
19
30
  raise ValueError(f"Layer '{layer}' not found in adata.layers.")
@@ -44,3 +55,8 @@ def clean_NaN(adata, layer=None):
44
55
  print('Making layer: nan_half')
45
56
  df_nan_half = df.fillna(0.5)
46
57
  adata.layers['nan_half'] = df_nan_half.values
58
+
59
+ # mark as done
60
+ adata.uns[uns_flag] = True
61
+
62
+ return None
@@ -0,0 +1,158 @@
1
+ from typing import Optional, Union, Sequence
2
+ import numpy as np
3
+ import pandas as pd
4
+ import anndata as ad
5
+
6
def filter_reads_on_length_quality_mapping(
    adata: "ad.AnnData",
    filter_on_coordinates: Union[bool, Sequence] = False,
    # Range params: each is a [min, max] pair; either bound may be None.
    read_length: Optional[Sequence[float]] = None,
    length_ratio: Optional[Sequence[float]] = None,
    read_quality: Optional[Sequence[float]] = None,
    mapping_quality: Optional[Sequence[float]] = None,
    uns_flag: str = "reads_removed_failing_length_quality_mapping_qc",
    bypass: bool = False,
    force_redo: bool = True
) -> "ad.AnnData":
    """
    Filter an AnnData by coordinate window, read length, mapped-to-reference
    length ratio, read quality and mapping quality.

    Each range argument is a two-element sequence ``[min, max]``; either bound
    may be ``None`` to leave that side open, and swapped bounds are corrected.
    Filters whose backing ``.obs`` column is missing are skipped with a warning,
    as is a malformed (non 2-element) range argument.

    Parameters:
        adata: Input AnnData. Filters read the 'mapped_length',
            'mapped_length_to_reference_length_ratio', 'read_quality' and
            'mapping_quality' columns of ``.obs`` when present.
        filter_on_coordinates: False, or an iterable of two numbers (low, high)
            used to subset variables (``var_names`` must parse as numeric).
        read_length / length_ratio / read_quality / mapping_quality:
            [min, max] ranges applied to the corresponding ``.obs`` columns.
        uns_flag: ``.uns`` key set to True on the returned object.
        bypass: If True, return ``adata`` unchanged.
        force_redo: If True, rerun even when ``uns_flag`` is already set.

    Returns:
        A filtered copy of ``adata`` marked with ``uns[uns_flag] = True``
        (or the input object itself when bypassed / already done).
    """
    # early exit
    already = bool(adata.uns.get(uns_flag, False))
    if bypass or (already and not force_redo):
        return adata

    adata_work = adata
    start_n = adata_work.n_obs

    # --- coordinate filtering ---
    if filter_on_coordinates:
        try:
            low, high = tuple(filter_on_coordinates)
        except Exception:
            raise ValueError("filter_on_coordinates must be False or an iterable of two numbers (low, high).")
        try:
            var_coords = np.array([float(v) for v in adata_work.var_names])
            if low > high:
                low, high = high, low
            col_mask_bool = (var_coords >= float(low)) & (var_coords <= float(high))
            if not col_mask_bool.any():
                # Empty window: fall back to the span between the coordinates
                # nearest to each requested bound.
                start_idx = int(np.argmin(np.abs(var_coords - float(low))))
                end_idx = int(np.argmin(np.abs(var_coords - float(high))))
                lo_idx, hi_idx = min(start_idx, end_idx), max(start_idx, end_idx)
                selected_cols = list(adata_work.var_names[lo_idx : hi_idx + 1])
            else:
                selected_cols = list(adata_work.var_names[col_mask_bool])
            print(f"Subsetting adata to coordinates between {low} and {high}: keeping {len(selected_cols)} variables.")
            adata_work = adata_work[:, selected_cols].copy()
        except Exception:
            print("Warning: could not interpret adata.var_names as numeric coordinates — skipping coordinate filtering.")

    def _coerce_range(range_arg):
        """Return (min, max) floats from a 2-sequence; (None, None) for anything
        else. Swapped bounds are corrected; a None bound stays open."""
        if range_arg is None:
            return None, None
        if not isinstance(range_arg, (list, tuple, np.ndarray)) or len(range_arg) != 2:
            # not a 2-element range -> treat as no restriction
            return None, None
        lo = None if range_arg[0] is None else float(range_arg[0])
        hi = None if range_arg[1] is None else float(range_arg[1])
        if (lo is not None) and (hi is not None) and lo > hi:
            lo, hi = hi, lo
        return lo, hi

    combined_mask = pd.Series(True, index=adata_work.obs.index)

    def _apply_range_filter(range_arg, column, label):
        """Fold one [min, max] filter on obs[column] into combined_mask.
        No-op when both bounds are None; warns and skips when the column
        is absent. Rows with non-numeric values are dropped by the filter."""
        nonlocal combined_mask
        lo, hi = _coerce_range(range_arg)
        if lo is None and hi is None:
            return
        if column not in adata_work.obs.columns:
            print(f"Warning: '{column}' not found in adata.obs — skipping {label} filter.")
            return
        vals = pd.to_numeric(adata_work.obs[column], errors="coerce")
        mask = pd.Series(True, index=adata_work.obs.index)
        if lo is not None:
            mask &= (vals >= lo)
        if hi is not None:
            mask &= (vals <= hi)
        mask &= vals.notna()
        combined_mask &= mask
        print(f"Planned {label} filter: min={lo}, max={hi}")

    # Apply each filter against its backing obs column.
    _apply_range_filter(read_length, "mapped_length", "read_length")
    _apply_range_filter(length_ratio, "mapped_length_to_reference_length_ratio", "length_ratio")
    _apply_range_filter(read_quality, "read_quality", "read_quality")
    _apply_range_filter(mapping_quality, "mapping_quality", "mapping_quality")

    # Apply combined mask and report
    s0 = adata_work.n_obs
    combined_mask_bool = combined_mask.astype(bool).values
    adata_work = adata_work[combined_mask_bool].copy()
    s1 = adata_work.n_obs
    print(f"Combined filters applied: kept {s1} / {s0} reads (removed {s0 - s1})")

    final_n = adata_work.n_obs
    print(f"Filtering complete: start={start_n}, final={final_n}, removed={start_n - final_n}")

    # mark as done
    adata_work.uns[uns_flag] = True

    return adata_work