smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,15 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Optional, Tuple
3
5
 
4
6
  import anndata as ad
5
7
 
8
+ from smftools.logging_utils import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
6
13
  def preprocess_adata(
7
14
  config_path: str,
8
15
  ) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
@@ -29,8 +36,8 @@ def preprocess_adata(
29
36
  Path to preprocessed, duplicate-removed AnnData.
30
37
  """
31
38
  from ..readwrite import safe_read_h5ad
32
- from .load_adata import load_adata
33
39
  from .helpers import get_adata_paths
40
+ from .load_adata import load_adata
34
41
 
35
42
  # 1) Ensure config is loaded and at least *some* AnnData stage exists
36
43
  loaded_adata, loaded_path, cfg = load_adata(config_path)
@@ -60,20 +67,27 @@ def preprocess_adata(
60
67
  # Case A: full redo of preprocessing
61
68
  # -----------------------------
62
69
  if getattr(cfg, "force_redo_preprocessing", False):
63
- print("Forcing full redo of preprocessing workflow, starting from latest stage AnnData available.")
70
+ logger.info(
71
+ "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
72
+ )
64
73
 
65
74
  if hmm_exists:
66
75
  adata = _load(hmm_path)
76
+ source_path = hmm_path
67
77
  elif spatial_exists:
68
78
  adata = _load(spatial_path)
79
+ source_path = spatial_path
69
80
  elif pp_dedup_exists:
70
81
  adata = _load(pp_dedup_path)
82
+ source_path = pp_dedup_path
71
83
  elif pp_exists:
72
84
  adata = _load(pp_path)
85
+ source_path = pp_path
73
86
  elif raw_exists:
74
87
  adata = _load(raw_path)
88
+ source_path = raw_path
75
89
  else:
76
- print("Cannot redo preprocessing: no AnnData available at any stage.")
90
+ logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
77
91
  return (None, None, None, None)
78
92
 
79
93
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -81,6 +95,8 @@ def preprocess_adata(
81
95
  cfg=cfg,
82
96
  pp_adata_path=pp_path,
83
97
  pp_dup_rem_adata_path=pp_dedup_path,
98
+ source_adata_path=source_path,
99
+ config_path=config_path,
84
100
  )
85
101
  return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
86
102
 
@@ -88,16 +104,18 @@ def preprocess_adata(
88
104
  # Case B: redo duplicate detection only
89
105
  # -----------------------------
90
106
  if getattr(cfg, "force_redo_flag_duplicate_reads", False):
91
- print(
107
+ logger.info(
92
108
  "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
93
109
  "if available. Otherwise, will use the raw AnnData."
94
110
  )
95
111
  if pp_exists:
96
112
  adata = _load(pp_path)
113
+ source_path = pp_path
97
114
  elif raw_exists:
98
115
  adata = _load(raw_path)
116
+ source_path = raw_path
99
117
  else:
100
- print(
118
+ logger.error(
101
119
  "Cannot redo duplicate detection: no compatible AnnData available "
102
120
  "(need at least raw or preprocessed)."
103
121
  )
@@ -108,6 +126,8 @@ def preprocess_adata(
108
126
  cfg=cfg,
109
127
  pp_adata_path=pp_path,
110
128
  pp_dup_rem_adata_path=pp_dedup_path,
129
+ source_adata_path=source_path,
130
+ config_path=config_path,
111
131
  )
112
132
  return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
113
133
 
@@ -117,43 +137,51 @@ def preprocess_adata(
117
137
 
118
138
  # If HMM exists, preprocessing is considered “done enough”
119
139
  if hmm_exists:
120
- print(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
140
+ logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
121
141
  return (None, None, None, None)
122
142
 
123
143
  # If spatial exists, also skip re-preprocessing by default
124
144
  if spatial_exists:
125
- print(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
145
+ logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
126
146
  return (None, None, None, None)
127
147
 
128
148
  # If pp_dedup exists, just return paths (no recomputation)
129
149
  if pp_dedup_exists:
130
- print(f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}")
150
+ logger.debug(
151
+ f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
152
+ )
131
153
  return (None, pp_path, None, pp_dedup_path)
132
154
 
133
155
  # If pp exists but pp_dedup does not, load pp and run core
134
156
  if pp_exists:
135
- print(f"Preprocessed AnnData found: {pp_path}")
157
+ logger.debug(f"Preprocessed AnnData found: {pp_path}")
136
158
  adata = _load(pp_path)
159
+ source_path = pp_path
137
160
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
138
161
  adata=adata,
139
162
  cfg=cfg,
140
163
  pp_adata_path=pp_path,
141
164
  pp_dup_rem_adata_path=pp_dedup_path,
165
+ source_adata_path=source_path,
166
+ config_path=config_path,
142
167
  )
143
168
  return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
144
169
 
145
170
  # Otherwise, fall back to raw (if available)
146
171
  if raw_exists:
147
172
  adata = _load(raw_path)
173
+ source_path = raw_path
148
174
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
149
175
  adata=adata,
150
176
  cfg=cfg,
151
177
  pp_adata_path=pp_path,
152
178
  pp_dup_rem_adata_path=pp_dedup_path,
179
+ source_adata_path=source_path,
180
+ config_path=config_path,
153
181
  )
154
182
  return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
155
183
 
156
- print("No AnnData available at any stage for preprocessing.")
184
+ logger.error("No AnnData available at any stage for preprocessing.")
157
185
  return (None, None, None, None)
158
186
 
159
187
 
@@ -162,6 +190,8 @@ def preprocess_adata_core(
162
190
  cfg,
163
191
  pp_adata_path: Path,
164
192
  pp_dup_rem_adata_path: Path,
193
+ source_adata_path: Optional[Path] = None,
194
+ config_path: Optional[str] = None,
165
195
  ) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
166
196
  """
167
197
  Core preprocessing pipeline.
@@ -190,31 +220,32 @@ def preprocess_adata_core(
190
220
  """
191
221
  from pathlib import Path
192
222
 
193
- import numpy as np
194
-
195
- from .helpers import write_gz_h5ad
196
- from ..readwrite import make_dirs
223
+ from ..metadata import record_smftools_metadata
224
+ from ..plotting import plot_read_qc_histograms
197
225
  from ..preprocessing import (
198
- load_sample_sheet,
199
- filter_reads_on_length_quality_mapping,
200
- clean_NaN,
201
- calculate_coverage,
202
226
  append_base_context,
203
227
  append_binary_layer_by_base_context,
228
+ binarize_adata,
229
+ binarize_on_Youden,
230
+ calculate_complexity_II,
231
+ calculate_coverage,
232
+ calculate_position_Youden,
204
233
  calculate_read_modification_stats,
234
+ clean_NaN,
235
+ filter_reads_on_length_quality_mapping,
205
236
  filter_reads_on_modification_thresholds,
206
237
  flag_duplicate_reads,
207
- calculate_complexity_II,
208
- calculate_position_Youden,
209
- binarize_on_Youden,
210
- binarize_adata,
238
+ load_sample_sheet,
211
239
  )
212
- from ..plotting import plot_read_qc_histograms
240
+ from ..readwrite import make_dirs
241
+ from .helpers import write_gz_h5ad
213
242
 
214
243
  ################################### 1) Load existing ###################################
215
244
  # General config variable init - Necessary user passed inputs
216
- smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
217
- output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
245
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
246
+ output_directory = Path(
247
+ cfg.output_directory
248
+ ) # Path to the output directory to make for the analysis. Necessary.
218
249
  make_dirs([output_directory])
219
250
 
220
251
  ######### Begin Preprocessing #########
@@ -222,172 +253,238 @@ def preprocess_adata_core(
222
253
 
223
254
  ## Load sample sheet metadata based on barcode mapping ##
224
255
  if getattr(cfg, "sample_sheet_path", None):
225
- load_sample_sheet(adata,
226
- cfg.sample_sheet_path,
227
- mapping_key_column=cfg.sample_sheet_mapping_column,
228
- as_category=True,
229
- force_reload=cfg.force_reload_sample_sheet)
256
+ load_sample_sheet(
257
+ adata,
258
+ cfg.sample_sheet_path,
259
+ mapping_key_column=cfg.sample_sheet_mapping_column,
260
+ as_category=True,
261
+ force_reload=cfg.force_reload_sample_sheet,
262
+ )
230
263
  else:
231
264
  pass
232
-
265
+
233
266
  # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
234
267
  pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
235
268
 
236
269
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
237
- print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
270
+ logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
238
271
  else:
239
272
  make_dirs([pp_dir, pp_length_qc_dir])
240
- plot_read_qc_histograms(adata,
241
- pp_length_qc_dir,
242
- cfg.obs_to_plot_pp_qc,
243
- sample_key=cfg.sample_name_col_for_plotting,
244
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
273
+ plot_read_qc_histograms(
274
+ adata,
275
+ pp_length_qc_dir,
276
+ cfg.obs_to_plot_pp_qc,
277
+ sample_key=cfg.sample_name_col_for_plotting,
278
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
279
+ )
245
280
 
246
281
  # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
247
282
  print(adata.shape)
248
- adata = filter_reads_on_length_quality_mapping(adata,
249
- filter_on_coordinates=cfg.read_coord_filter,
250
- read_length=cfg.read_len_filter_thresholds,
251
- length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
252
- read_quality=cfg.read_quality_filter_thresholds,
253
- mapping_quality=cfg.read_mapping_quality_filter_thresholds,
254
- bypass=None,
255
- force_redo=None)
283
+ adata = filter_reads_on_length_quality_mapping(
284
+ adata,
285
+ filter_on_coordinates=cfg.read_coord_filter,
286
+ read_length=cfg.read_len_filter_thresholds,
287
+ length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
288
+ read_quality=cfg.read_quality_filter_thresholds,
289
+ mapping_quality=cfg.read_mapping_quality_filter_thresholds,
290
+ bypass=None,
291
+ force_redo=None,
292
+ )
256
293
  print(adata.shape)
257
294
 
258
295
  pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
259
296
 
260
297
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
261
- print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
298
+ logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
262
299
  else:
263
300
  make_dirs([pp_dir, pp_length_qc_dir])
264
- plot_read_qc_histograms(adata,
265
- pp_length_qc_dir,
266
- cfg.obs_to_plot_pp_qc,
267
- sample_key=cfg.sample_name_col_for_plotting,
268
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
269
-
301
+ plot_read_qc_histograms(
302
+ adata,
303
+ pp_length_qc_dir,
304
+ cfg.obs_to_plot_pp_qc,
305
+ sample_key=cfg.sample_name_col_for_plotting,
306
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
307
+ )
308
+
270
309
  ############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
271
- if smf_modality == 'direct':
310
+ if smf_modality == "direct":
272
311
  native = True
273
312
  if cfg.fit_position_methylation_thresholds:
274
313
  pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
275
314
  make_dirs([pp_Youden_dir])
276
315
  # Calculate positional methylation thresholds for mod calls
277
- calculate_position_Youden(adata,
278
- positive_control_sample=cfg.positive_control_sample_methylation_fitting,
279
- negative_control_sample=cfg.negative_control_sample_methylation_fitting,
280
- J_threshold=cfg.fit_j_threshold,
281
- ref_column=cfg.reference_column,
282
- sample_column=cfg.sample_column,
283
- infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
284
- inference_variable=cfg.inference_variable_sample_methylation_fitting,
285
- save=True,
286
- output_directory=pp_Youden_dir
287
- )
316
+ calculate_position_Youden(
317
+ adata,
318
+ positive_control_sample=cfg.positive_control_sample_methylation_fitting,
319
+ negative_control_sample=cfg.negative_control_sample_methylation_fitting,
320
+ J_threshold=cfg.fit_j_threshold,
321
+ ref_column=cfg.reference_column,
322
+ sample_column=cfg.sample_column,
323
+ infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
324
+ inference_variable=cfg.inference_variable_sample_methylation_fitting,
325
+ save=True,
326
+ output_directory=pp_Youden_dir,
327
+ )
288
328
  # binarize the modcalls based on the determined thresholds
289
- binarize_on_Youden(adata,
290
- ref_column=cfg.reference_column,
291
- output_layer_name=cfg.output_binary_layer_name
292
- )
329
+ binarize_on_Youden(
330
+ adata,
331
+ ref_column=cfg.reference_column,
332
+ output_layer_name=cfg.output_binary_layer_name,
333
+ )
293
334
  else:
294
- binarize_adata(adata,
295
- source="X",
296
- target_layer=cfg.output_binary_layer_name,
297
- threshold=cfg.binarize_on_fixed_methlyation_threshold)
298
-
299
- clean_NaN(adata,
300
- layer=cfg.output_binary_layer_name,
301
- bypass=cfg.bypass_clean_nan,
302
- force_redo=cfg.force_redo_clean_nan
303
- )
335
+ binarize_adata(
336
+ adata,
337
+ source="X",
338
+ target_layer=cfg.output_binary_layer_name,
339
+ threshold=cfg.binarize_on_fixed_methlyation_threshold,
340
+ )
341
+
342
+ clean_NaN(
343
+ adata,
344
+ layer=cfg.output_binary_layer_name,
345
+ bypass=cfg.bypass_clean_nan,
346
+ force_redo=cfg.force_redo_clean_nan,
347
+ )
304
348
  else:
305
349
  native = False
306
- clean_NaN(adata,
307
- bypass=cfg.bypass_clean_nan,
308
- force_redo=cfg.force_redo_clean_nan
309
- )
310
-
350
+ clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
351
+
311
352
  ############### Calculate positional coverage by reference set in dataset ###############
312
- calculate_coverage(adata,
313
- ref_column=cfg.reference_column,
314
- position_nan_threshold=cfg.position_max_nan_threshold)
353
+ calculate_coverage(
354
+ adata,
355
+ ref_column=cfg.reference_column,
356
+ position_nan_threshold=cfg.position_max_nan_threshold,
357
+ smf_modality=smf_modality,
358
+ target_layer=cfg.output_binary_layer_name,
359
+ )
315
360
 
316
361
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
317
362
  # Additionally, store base_context level binary modification arrays in adata.obsm
318
- append_base_context(adata,
319
- ref_column=cfg.reference_column,
320
- use_consensus=False,
321
- native=native,
322
- mod_target_bases=cfg.mod_target_bases,
323
- bypass=cfg.bypass_append_base_context,
324
- force_redo=cfg.force_redo_append_base_context)
325
-
326
- adata = append_binary_layer_by_base_context(adata,
327
- cfg.reference_column,
328
- smf_modality,
329
- bypass=cfg.bypass_append_binary_layer_by_base_context,
330
- force_redo=cfg.force_redo_append_binary_layer_by_base_context)
331
-
332
- ############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
333
- calculate_read_modification_stats(adata,
334
- cfg.reference_column,
335
- cfg.sample_column,
336
- cfg.mod_target_bases,
337
- bypass=cfg.bypass_calculate_read_modification_stats,
338
- force_redo=cfg.force_redo_calculate_read_modification_stats)
339
-
363
+ append_base_context(
364
+ adata,
365
+ ref_column=cfg.reference_column,
366
+ use_consensus=False,
367
+ native=native,
368
+ mod_target_bases=cfg.mod_target_bases,
369
+ bypass=cfg.bypass_append_base_context,
370
+ force_redo=cfg.force_redo_append_base_context,
371
+ )
372
+
373
+ ############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
374
+ calculate_read_modification_stats(
375
+ adata,
376
+ cfg.reference_column,
377
+ cfg.sample_column,
378
+ cfg.mod_target_bases,
379
+ bypass=cfg.bypass_calculate_read_modification_stats,
380
+ force_redo=cfg.force_redo_calculate_read_modification_stats,
381
+ )
382
+
340
383
  ### Make a dir for outputting sample level read modification metrics before filtering ###
341
384
  pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
342
385
 
343
386
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
344
- print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
387
+ logger.debug(
388
+ f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
389
+ )
345
390
  else:
346
391
  make_dirs([pp_dir, pp_meth_qc_dir])
347
- obs_to_plot = ['Raw_modification_signal']
348
- if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
349
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
350
- if 'A' in cfg.mod_target_bases:
351
- obs_to_plot += ['Fraction_A_site_modified']
352
- plot_read_qc_histograms(adata,
353
- pp_meth_qc_dir, obs_to_plot,
354
- sample_key=cfg.sample_name_col_for_plotting,
355
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
392
+ obs_to_plot = ["Raw_modification_signal"]
393
+ if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
394
+ obs_to_plot += [
395
+ "Fraction_GpC_site_modified",
396
+ "Fraction_CpG_site_modified",
397
+ "Fraction_other_C_site_modified",
398
+ "Fraction_C_site_modified",
399
+ ]
400
+ if "A" in cfg.mod_target_bases:
401
+ obs_to_plot += ["Fraction_A_site_modified"]
402
+ plot_read_qc_histograms(
403
+ adata,
404
+ pp_meth_qc_dir,
405
+ obs_to_plot,
406
+ sample_key=cfg.sample_name_col_for_plotting,
407
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
408
+ )
356
409
 
357
410
  ##### Optionally filter reads on modification metrics
358
- adata = filter_reads_on_modification_thresholds(adata,
359
- smf_modality=smf_modality,
360
- mod_target_bases=cfg.mod_target_bases,
361
- gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
362
- cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
363
- any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
364
- a_thresholds=cfg.read_mod_filtering_a_thresholds,
365
- use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
366
- min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
367
- bypass=cfg.bypass_filter_reads_on_modification_thresholds,
368
- force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)
369
-
411
+ adata = filter_reads_on_modification_thresholds(
412
+ adata,
413
+ smf_modality=smf_modality,
414
+ mod_target_bases=cfg.mod_target_bases,
415
+ gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
416
+ cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
417
+ any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
418
+ a_thresholds=cfg.read_mod_filtering_a_thresholds,
419
+ use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
420
+ min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
421
+ bypass=cfg.bypass_filter_reads_on_modification_thresholds,
422
+ force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
423
+ )
424
+
370
425
  pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
371
-
426
+
372
427
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
373
- print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
428
+ logger.debug(
429
+ f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
430
+ )
374
431
  else:
375
432
  make_dirs([pp_dir, pp_meth_qc_dir])
376
- obs_to_plot = ['Raw_modification_signal']
377
- if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
378
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
379
- if 'A' in cfg.mod_target_bases:
380
- obs_to_plot += ['Fraction_A_site_modified']
381
- plot_read_qc_histograms(adata,
382
- pp_meth_qc_dir, obs_to_plot,
383
- sample_key=cfg.sample_name_col_for_plotting,
384
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
433
+ obs_to_plot = ["Raw_modification_signal"]
434
+ if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
435
+ obs_to_plot += [
436
+ "Fraction_GpC_site_modified",
437
+ "Fraction_CpG_site_modified",
438
+ "Fraction_other_C_site_modified",
439
+ "Fraction_C_site_modified",
440
+ ]
441
+ if "A" in cfg.mod_target_bases:
442
+ obs_to_plot += ["Fraction_A_site_modified"]
443
+ plot_read_qc_histograms(
444
+ adata,
445
+ pp_meth_qc_dir,
446
+ obs_to_plot,
447
+ sample_key=cfg.sample_name_col_for_plotting,
448
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
449
+ )
450
+
451
+ ############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
452
+ calculate_coverage(
453
+ adata,
454
+ ref_column=cfg.reference_column,
455
+ position_nan_threshold=cfg.position_max_nan_threshold,
456
+ smf_modality=smf_modality,
457
+ target_layer=cfg.output_binary_layer_name,
458
+ force_redo=True,
459
+ )
460
+
461
+ ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
462
+ # Additionally, store base_context level binary modification arrays in adata.obsm
463
+ append_base_context(
464
+ adata,
465
+ ref_column=cfg.reference_column,
466
+ use_consensus=False,
467
+ native=native,
468
+ mod_target_bases=cfg.mod_target_bases,
469
+ bypass=cfg.bypass_append_base_context,
470
+ force_redo=True,
471
+ )
472
+
473
+ # Add site type binary modification layers for valid coverage sites
474
+ adata = append_binary_layer_by_base_context(
475
+ adata,
476
+ cfg.reference_column,
477
+ smf_modality,
478
+ bypass=cfg.bypass_append_binary_layer_by_base_context,
479
+ force_redo=cfg.force_redo_append_binary_layer_by_base_context,
480
+ from_valid_sites_only=True,
481
+ )
385
482
 
386
483
  ############### Duplicate detection for conversion/deamination SMF ###############
387
- if smf_modality != 'direct':
484
+ if smf_modality != "direct":
388
485
  references = adata.obs[cfg.reference_column].cat.categories
389
486
 
390
- var_filters_sets =[]
487
+ var_filters_sets = []
391
488
  for ref in references:
392
489
  for site_type in cfg.duplicate_detection_site_types:
393
490
  var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
@@ -397,27 +494,30 @@ def preprocess_adata_core(
397
494
  make_dirs([pp_dup_qc_dir])
398
495
 
399
496
  # Flag duplicate reads and plot duplicate detection QC
400
- adata_unique, adata = flag_duplicate_reads(adata,
401
- var_filters_sets,
402
- distance_threshold=cfg.duplicate_detection_distance_threshold,
403
- obs_reference_col=cfg.reference_column,
404
- sample_col=cfg.sample_name_col_for_plotting,
405
- output_directory=pp_dup_qc_dir,
406
- metric_keys=cfg.hamming_vs_metric_keys,
407
- keep_best_metric=cfg.duplicate_detection_keep_best_metric,
408
- bypass=cfg.bypass_flag_duplicate_reads,
409
- force_redo=cfg.force_redo_flag_duplicate_reads,
410
- window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
411
- min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
412
- do_pca=cfg.duplicate_detection_do_pca,
413
- pca_n_components=50,
414
- pca_center=True,
415
- do_hierarchical=cfg.duplicate_detection_do_hierarchical,
416
- hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
417
- hierarchical_metric="euclidean",
418
- hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors
419
- )
420
-
497
+ adata_unique, adata = flag_duplicate_reads(
498
+ adata,
499
+ var_filters_sets,
500
+ distance_threshold=cfg.duplicate_detection_distance_threshold,
501
+ obs_reference_col=cfg.reference_column,
502
+ sample_col=cfg.sample_name_col_for_plotting,
503
+ output_directory=pp_dup_qc_dir,
504
+ metric_keys=cfg.hamming_vs_metric_keys,
505
+ keep_best_metric=cfg.duplicate_detection_keep_best_metric,
506
+ bypass=cfg.bypass_flag_duplicate_reads,
507
+ force_redo=cfg.force_redo_flag_duplicate_reads,
508
+ window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
509
+ min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
510
+ do_pca=cfg.duplicate_detection_do_pca,
511
+ pca_n_components=50,
512
+ pca_center=True,
513
+ do_hierarchical=cfg.duplicate_detection_do_hierarchical,
514
+ hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
515
+ hierarchical_metric="euclidean",
516
+ hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
517
+ demux_types=("double", "already"),
518
+ demux_col="demux_type",
519
+ )
520
+
421
521
  # Use the flagged duplicate read groups and perform complexity analysis
422
522
  complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
423
523
  make_dirs([complexity_outs])
@@ -426,15 +526,15 @@ def preprocess_adata_core(
426
526
  output_directory=complexity_outs,
427
527
  sample_col=cfg.sample_name_col_for_plotting,
428
528
  ref_col=cfg.reference_column,
429
- cluster_col='sequence__merged_cluster_id',
529
+ cluster_col="sequence__merged_cluster_id",
430
530
  plot=True,
431
- save_plot=True, # set False to display instead
531
+ save_plot=True, # set False to display instead
432
532
  n_boot=30,
433
533
  n_depths=12,
434
534
  random_state=42,
435
535
  csv_summary=True,
436
536
  bypass=cfg.bypass_complexity_analysis,
437
- force_redo=cfg.force_redo_complexity_analysis
537
+ force_redo=cfg.force_redo_complexity_analysis,
438
538
  )
439
539
 
440
540
  else:
@@ -443,13 +543,29 @@ def preprocess_adata_core(
443
543
 
444
544
  ############################################### Save preprocessed adata with duplicate detection ###############################################
445
545
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
446
- print('Saving preprocessed adata.')
546
+ logger.info("Saving preprocessed adata.")
547
+ record_smftools_metadata(
548
+ adata,
549
+ step_name="preprocess",
550
+ cfg=cfg,
551
+ config_path=config_path,
552
+ input_paths=[source_adata_path] if source_adata_path else None,
553
+ output_path=pp_adata_path,
554
+ )
447
555
  write_gz_h5ad(adata, pp_adata_path)
448
556
 
449
557
  if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
450
- print('Saving preprocessed adata with duplicates removed.')
451
- write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
558
+ logger.info("Saving preprocessed adata with duplicates removed.")
559
+ record_smftools_metadata(
560
+ adata_unique,
561
+ step_name="preprocess",
562
+ cfg=cfg,
563
+ config_path=config_path,
564
+ input_paths=[pp_adata_path],
565
+ output_path=pp_dup_rem_adata_path,
566
+ )
567
+ write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
452
568
 
453
569
  ########################################################################################################################
454
570
 
455
- return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
571
+ return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)