smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/filter_adata_by_nan_proportion.py
@@ -1,26 +1,38 @@
 ## filter_adata_by_nan_proportion
 
-def filter_adata_by_nan_proportion(adata, threshold, axis='obs'):
-    """
-    Filters an anndata object on a nan proportion threshold in a given matrix axis.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def filter_adata_by_nan_proportion(
+    adata: "ad.AnnData", threshold: float, axis: str = "obs"
+) -> "ad.AnnData":
+    """Filter an AnnData object on NaN proportion in a matrix axis.
+
+    Args:
+        adata: AnnData object to filter.
+        threshold: Maximum allowed NaN proportion.
+        axis: Whether to filter based on ``"obs"`` or ``"var"`` NaN content.
 
-    Parameters:
-        adata (AnnData):
-        threshold (float): The max np.nan content to allow in the given axis.
-        axis (str): Whether to filter the adata based on obs or var np.nan content
     Returns:
-        filtered_adata
+        anndata.AnnData: Filtered AnnData object.
+
+    Raises:
+        ValueError: If ``axis`` is not ``"obs"`` or ``"var"``.
     """
     import numpy as np
-    import anndata as ad
 
-    if axis == 'obs':
+    if axis == "obs":
         # Calculate the proportion of NaN values in each read
         nan_proportion = np.isnan(adata.X).mean(axis=1)
         # Filter reads to keep reads with less than a certain NaN proportion
         filtered_indices = np.where(nan_proportion <= threshold)[0]
         filtered_adata = adata[filtered_indices, :].copy()
-    elif axis == 'var':
+    elif axis == "var":
         # Calculate the proportion of NaN values at a given position
         nan_proportion = np.isnan(adata.X).mean(axis=0)
         # Filter positions to keep positions with less than a certain NaN proportion
@@ -28,4 +40,4 @@ def filter_adata_by_nan_proportion(adata, threshold, axis='obs'):
         filtered_adata = adata[:, filtered_indices].copy()
     else:
         raise ValueError("Axis must be either 'obs' or 'var'")
-    return filtered_adata
+    return filtered_adata
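For readers following this change, here is a minimal usage sketch of the updated signature. The toy matrix, the 0.2 threshold, and the direct module-path import are illustrative assumptions; only the function name, parameters, and module location come from the diff above.

# Usage sketch (hypothetical example data; only the function itself comes from the package).
import anndata as ad
import numpy as np

from smftools.preprocessing.filter_adata_by_nan_proportion import filter_adata_by_nan_proportion

# Toy 4-read x 5-position matrix with missing calls encoded as NaN.
X = np.array(
    [
        [0.0, 1.0, np.nan, 1.0, 0.0],
        [np.nan, np.nan, np.nan, 1.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 1.0],
        [1.0, np.nan, 0.0, 0.0, 1.0],
    ]
)
adata = ad.AnnData(X)

# Keep reads (obs) whose NaN proportion is at most 0.2 ...
reads_kept = filter_adata_by_nan_proportion(adata, threshold=0.2, axis="obs")
# ... or keep positions (var) whose NaN proportion is at most 0.2.
positions_kept = filter_adata_by_nan_proportion(adata, threshold=0.2, axis="var")
print(reads_kept.shape, positions_kept.shape)

With this toy input, the obs call keeps the three reads at or below 20% NaN, while the var call keeps the two fully observed positions.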
smftools/preprocessing/filter_reads_on_length_quality_mapping.py
@@ -1,28 +1,41 @@
-from typing import Optional, Union, Sequence
+from typing import Optional, Sequence, Union
+
+import anndata as ad
 import numpy as np
 import pandas as pd
-import anndata as ad
+
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
 def filter_reads_on_length_quality_mapping(
     adata: ad.AnnData,
     filter_on_coordinates: Union[bool, Sequence] = False,
     # New single-range params (preferred):
-    read_length: Optional[Sequence[float]] = None, # e.g. [min, max]
-    length_ratio: Optional[Sequence[float]] = None, # e.g. [min, max]
-    read_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
-    mapping_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
-    uns_flag: str = "reads_removed_failing_length_quality_mapping_qc",
+    read_length: Optional[Sequence[float]] = None,  # e.g. [min, max]
+    length_ratio: Optional[Sequence[float]] = None,  # e.g. [min, max]
+    read_quality: Optional[Sequence[float]] = None,  # e.g. [min, max] (commonly min only)
+    mapping_quality: Optional[Sequence[float]] = None,  # e.g. [min, max] (commonly min only)
+    uns_flag: str = "filter_reads_on_length_quality_mapping_performed",
     bypass: bool = False,
-    force_redo: bool = True
+    force_redo: bool = True,
 ) -> ad.AnnData:
-    """
-    Filter AnnData by coordinate window, read length, length ratios, read quality and mapping quality.
-
-    New: you may pass `read_length=[min, max]` (or tuple) to set both min/max in one argument.
-    If `read_length` is given it overrides scalar min/max variants (which are not present in this signature).
-    Same behavior supported for `length_ratio`, `read_quality`, `mapping_quality`.
-
-    Returns a filtered copy of the input AnnData and marks adata.uns[uns_flag] = True.
+    """Filter AnnData by coordinates, read length, quality, and mapping metrics.
+
+    Args:
+        adata: AnnData object to filter.
+        filter_on_coordinates: Optional coordinate window as a two-value sequence.
+        read_length: Read length range as ``[min, max]``.
+        length_ratio: Length ratio range as ``[min, max]``.
+        read_quality: Read quality range as ``[min, max]``.
+        mapping_quality: Mapping quality range as ``[min, max]``.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        bypass: Whether to skip processing.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
+
+    Returns:
+        anndata.AnnData: Filtered copy of the input AnnData.
     """
     # early exit
     already = bool(adata.uns.get(uns_flag, False))
@@ -37,7 +50,9 @@ def filter_reads_on_length_quality_mapping(
         try:
             low, high = tuple(filter_on_coordinates)
         except Exception:
-            raise ValueError("filter_on_coordinates must be False or an iterable of two numbers (low, high).")
+            raise ValueError(
+                "filter_on_coordinates must be False or an iterable of two numbers (low, high)."
+            )
         try:
             var_coords = np.array([float(v) for v in adata_work.var_names])
             if low > high:
@@ -50,10 +65,17 @@ def filter_reads_on_length_quality_mapping(
                 selected_cols = list(adata_work.var_names[lo_idx : hi_idx + 1])
             else:
                 selected_cols = list(adata_work.var_names[col_mask_bool])
-            print(f"Subsetting adata to coordinates between {low} and {high}: keeping {len(selected_cols)} variables.")
+            logger.info(
+                "Subsetting adata to coordinates between %s and %s: keeping %s variables.",
+                low,
+                high,
+                len(selected_cols),
+            )
             adata_work = adata_work[:, selected_cols].copy()
         except Exception:
-            print("Warning: could not interpret adata.var_names as numeric coordinates — skipping coordinate filtering.")
+            logger.warning(
+                "Could not interpret adata.var_names as numeric coordinates — skipping coordinate filtering."
+            )
 
     # --- helper to coerce range inputs ---
     def _coerce_range(range_arg):
@@ -85,72 +107,83 @@ def filter_reads_on_length_quality_mapping(
     # read length filter
     if (rl_min is not None) or (rl_max is not None):
         if "mapped_length" not in adata_work.obs.columns:
-            print("Warning: 'mapped_length' not found in adata.obs — skipping read_length filter.")
+            logger.warning("'mapped_length' not found in adata.obs — skipping read_length filter.")
         else:
             vals = pd.to_numeric(adata_work.obs["mapped_length"], errors="coerce")
             mask = pd.Series(True, index=adata_work.obs.index)
             if rl_min is not None:
-                mask &= (vals >= rl_min)
+                mask &= vals >= rl_min
             if rl_max is not None:
-                mask &= (vals <= rl_max)
+                mask &= vals <= rl_max
             mask &= vals.notna()
            combined_mask &= mask
-            print(f"Planned read_length filter: min={rl_min}, max={rl_max}")
+            logger.info("Planned read_length filter: min=%s, max=%s", rl_min, rl_max)
 
     # length ratio filter
     if (lr_min is not None) or (lr_max is not None):
         if "mapped_length_to_reference_length_ratio" not in adata_work.obs.columns:
-            print("Warning: 'mapped_length_to_reference_length_ratio' not found in adata.obs — skipping length_ratio filter.")
+            logger.warning(
+                "'mapped_length_to_reference_length_ratio' not found in adata.obs — skipping length_ratio filter."
+            )
         else:
-            vals = pd.to_numeric(adata_work.obs["mapped_length_to_reference_length_ratio"], errors="coerce")
+            vals = pd.to_numeric(
+                adata_work.obs["mapped_length_to_reference_length_ratio"], errors="coerce"
+            )
             mask = pd.Series(True, index=adata_work.obs.index)
             if lr_min is not None:
-                mask &= (vals >= lr_min)
+                mask &= vals >= lr_min
             if lr_max is not None:
-                mask &= (vals <= lr_max)
+                mask &= vals <= lr_max
             mask &= vals.notna()
             combined_mask &= mask
-            print(f"Planned length_ratio filter: min={lr_min}, max={lr_max}")
+            logger.info("Planned length_ratio filter: min=%s, max=%s", lr_min, lr_max)
 
     # read quality filter (supporting optional range but typically min only)
     if (rq_min is not None) or (rq_max is not None):
         if "read_quality" not in adata_work.obs.columns:
-            print("Warning: 'read_quality' not found in adata.obs — skipping read_quality filter.")
+            logger.warning("'read_quality' not found in adata.obs — skipping read_quality filter.")
         else:
             vals = pd.to_numeric(adata_work.obs["read_quality"], errors="coerce")
             mask = pd.Series(True, index=adata_work.obs.index)
             if rq_min is not None:
-                mask &= (vals >= rq_min)
+                mask &= vals >= rq_min
             if rq_max is not None:
-                mask &= (vals <= rq_max)
+                mask &= vals <= rq_max
             mask &= vals.notna()
             combined_mask &= mask
-            print(f"Planned read_quality filter: min={rq_min}, max={rq_max}")
+            logger.info("Planned read_quality filter: min=%s, max=%s", rq_min, rq_max)
 
     # mapping quality filter (supporting optional range but typically min only)
     if (mq_min is not None) or (mq_max is not None):
         if "mapping_quality" not in adata_work.obs.columns:
-            print("Warning: 'mapping_quality' not found in adata.obs — skipping mapping_quality filter.")
+            logger.warning(
+                "'mapping_quality' not found in adata.obs — skipping mapping_quality filter."
+            )
         else:
             vals = pd.to_numeric(adata_work.obs["mapping_quality"], errors="coerce")
             mask = pd.Series(True, index=adata_work.obs.index)
             if mq_min is not None:
-                mask &= (vals >= mq_min)
+                mask &= vals >= mq_min
             if mq_max is not None:
-                mask &= (vals <= mq_max)
+                mask &= vals <= mq_max
             mask &= vals.notna()
             combined_mask &= mask
-            print(f"Planned mapping_quality filter: min={mq_min}, max={mq_max}")
+            logger.info("Planned mapping_quality filter: min=%s, max=%s", mq_min, mq_max)
 
     # Apply combined mask and report
     s0 = adata_work.n_obs
     combined_mask_bool = combined_mask.astype(bool).values
     adata_work = adata_work[combined_mask_bool].copy()
     s1 = adata_work.n_obs
-    print(f"Combined filters applied: kept {s1} / {s0} reads (removed {s0 - s1})")
+    logger.info("Combined filters applied: kept %s / %s reads (removed %s)", s1, s0, s0 - s1)
 
     final_n = adata_work.n_obs
-    print(f"Filtering complete: start={start_n}, final={final_n}, removed={start_n - final_n}")
+    logger.info(
+        "Filtering complete: start=%s, final=%s, removed=%s",
+        start_n,
+        final_n,
+        start_n - final_n,
+    )
 
     # mark as done
     adata_work.uns[uns_flag] = True
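Alongside the new range-style parameters, this release replaces print() reporting with module-level loggers obtained from smftools.logging_utils.get_logger. Below is a minimal usage sketch of the updated filter; the obs values, the chosen ranges, and the logging.basicConfig call (which assumes get_logger wraps the standard logging module) are illustrative assumptions, while the parameter names, obs column names, and the uns flag come from the diff above.

# Usage sketch (hypothetical example values; parameters and column names come from the diff).
import logging

import anndata as ad
import numpy as np

from smftools.preprocessing.filter_reads_on_length_quality_mapping import (
    filter_reads_on_length_quality_mapping,
)

# Assumes get_logger wraps the stdlib logging module, so basicConfig surfaces the INFO summaries.
logging.basicConfig(level=logging.INFO)

# Toy AnnData whose obs columns mirror the ones the filter reads.
adata = ad.AnnData(np.random.rand(6, 10))
adata.obs["mapped_length"] = [500, 1200, 3000, 800, 4500, 950]
adata.obs["mapped_length_to_reference_length_ratio"] = [0.5, 0.9, 1.0, 0.7, 1.1, 0.95]
adata.obs["read_quality"] = [10, 15, 22, 9, 30, 18]
adata.obs["mapping_quality"] = [20, 60, 60, 10, 60, 50]

# Each range argument is a [min, max] pair, per the new single-range parameters.
filtered = filter_reads_on_length_quality_mapping(
    adata,
    read_length=[700, 4000],
    length_ratio=[0.8, 1.2],
    read_quality=[12, 60],
    mapping_quality=[30, 60],
)
print(filtered.n_obs, filtered.uns.get("filter_reads_on_length_quality_mapping_performed"))

With these toy values, reads failing any of the four ranges are dropped and the remaining reads are returned in a filtered copy flagged in .uns.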