smftools 0.2.4-py3-none-any.whl → 0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
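To audit a diff like this locally, you can download both wheels (for example with pip download smftools==0.2.4 --no-deps, and the same for 0.2.5) and compare any member file using only the Python standard library. The sketch below is illustrative: it assumes the two wheel files are already in the working directory and simply re-derives a unified diff for the module shown later on this page.

# Sketch: diff one module between the two wheels named in the title.
# Wheels are zip archives, so zipfile + difflib from the standard library suffice.
import difflib
import zipfile

OLD_WHEEL = "smftools-0.2.4-py3-none-any.whl"  # assumed to be downloaded already
NEW_WHEEL = "smftools-0.2.5-py3-none-any.whl"
MEMBER = "smftools/cli/preprocess_adata.py"  # the file whose hunks are shown below


def read_member(wheel_path: str, member: str) -> list[str]:
    """Return one text member of a wheel as a list of lines."""
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)


diff = difflib.unified_diff(
    read_member(OLD_WHEEL, MEMBER),
    read_member(NEW_WHEEL, MEMBER),
    fromfile=f"0.2.4/{MEMBER}",
    tofile=f"0.2.5/{MEMBER}",
)
print("".join(diff))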
Files changed (133)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
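A recurring change across the modules listed above is the replacement of print() calls with a shared logger from the new smftools/logging_utils.py. The short sketch below mirrors the pattern visible in the smftools/cli/preprocess_adata.py hunks that follow; the message text is illustrative only.

# Sketch of the logging pattern introduced in 0.2.5, mirroring the hunks below.
from smftools.logging_utils import get_logger

logger = get_logger(__name__)


def report_stage(path_exists: bool) -> None:
    # print(...) calls become leveled logger calls (info/debug/error) in 0.2.5.
    if path_exists:
        logger.debug("Skipping preprocessing. Preprocessed AnnData found.")
    else:
        logger.error("No AnnData available at any stage for preprocessing.")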
smftools/cli/preprocess_adata.py
@@ -3,6 +3,11 @@ from typing import Optional, Tuple
 
 import anndata as ad
 
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
 def preprocess_adata(
     config_path: str,
 ) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
@@ -29,8 +34,8 @@ def preprocess_adata(
         Path to preprocessed, duplicate-removed AnnData.
     """
     from ..readwrite import safe_read_h5ad
-    from .load_adata import load_adata
     from .helpers import get_adata_paths
+    from .load_adata import load_adata
 
     # 1) Ensure config is loaded and at least *some* AnnData stage exists
     loaded_adata, loaded_path, cfg = load_adata(config_path)
@@ -60,20 +65,27 @@ def preprocess_adata(
     # Case A: full redo of preprocessing
     # -----------------------------
     if getattr(cfg, "force_redo_preprocessing", False):
-        print("Forcing full redo of preprocessing workflow, starting from latest stage AnnData available.")
+        logger.info(
+            "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
+        )
 
         if hmm_exists:
            adata = _load(hmm_path)
+            source_path = hmm_path
         elif spatial_exists:
             adata = _load(spatial_path)
+            source_path = spatial_path
         elif pp_dedup_exists:
             adata = _load(pp_dedup_path)
+            source_path = pp_dedup_path
         elif pp_exists:
             adata = _load(pp_path)
+            source_path = pp_path
         elif raw_exists:
             adata = _load(raw_path)
+            source_path = raw_path
         else:
-            print("Cannot redo preprocessing: no AnnData available at any stage.")
+            logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
             return (None, None, None, None)
 
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -81,6 +93,8 @@ def preprocess_adata(
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
@@ -88,16 +102,18 @@ def preprocess_adata(
     # Case B: redo duplicate detection only
     # -----------------------------
     if getattr(cfg, "force_redo_flag_duplicate_reads", False):
-        print(
+        logger.info(
             "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
             "if available. Otherwise, will use the raw AnnData."
         )
         if pp_exists:
             adata = _load(pp_path)
+            source_path = pp_path
         elif raw_exists:
             adata = _load(raw_path)
+            source_path = raw_path
         else:
-            print(
+            logger.error(
                 "Cannot redo duplicate detection: no compatible AnnData available "
                 "(need at least raw or preprocessed)."
             )
@@ -108,6 +124,8 @@ def preprocess_adata(
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
@@ -117,43 +135,51 @@ def preprocess_adata(
 
     # If HMM exists, preprocessing is considered “done enough”
     if hmm_exists:
-        print(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
+        logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
         return (None, None, None, None)
 
     # If spatial exists, also skip re-preprocessing by default
     if spatial_exists:
-        print(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
+        logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
         return (None, None, None, None)
 
     # If pp_dedup exists, just return paths (no recomputation)
     if pp_dedup_exists:
-        print(f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}")
+        logger.debug(
+            f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
+        )
         return (None, pp_path, None, pp_dedup_path)
 
     # If pp exists but pp_dedup does not, load pp and run core
     if pp_exists:
-        print(f"Preprocessed AnnData found: {pp_path}")
+        logger.debug(f"Preprocessed AnnData found: {pp_path}")
         adata = _load(pp_path)
+        source_path = pp_path
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
             adata=adata,
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
     # Otherwise, fall back to raw (if available)
     if raw_exists:
         adata = _load(raw_path)
+        source_path = raw_path
         pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
             adata=adata,
             cfg=cfg,
             pp_adata_path=pp_path,
             pp_dup_rem_adata_path=pp_dedup_path,
+            source_adata_path=source_path,
+            config_path=config_path,
         )
         return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
 
-    print("No AnnData available at any stage for preprocessing.")
+    logger.error("No AnnData available at any stage for preprocessing.")
     return (None, None, None, None)
 
 
@@ -162,6 +188,8 @@ def preprocess_adata_core(
     cfg,
     pp_adata_path: Path,
     pp_dup_rem_adata_path: Path,
+    source_adata_path: Optional[Path] = None,
+    config_path: Optional[str] = None,
 ) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
     """
     Core preprocessing pipeline.
@@ -190,31 +218,32 @@ def preprocess_adata_core(
     """
     from pathlib import Path
 
-    import numpy as np
-
-    from .helpers import write_gz_h5ad
-    from ..readwrite import make_dirs
+    from ..metadata import record_smftools_metadata
+    from ..plotting import plot_read_qc_histograms
     from ..preprocessing import (
-        load_sample_sheet,
-        filter_reads_on_length_quality_mapping,
-        clean_NaN,
-        calculate_coverage,
         append_base_context,
         append_binary_layer_by_base_context,
+        binarize_adata,
+        binarize_on_Youden,
+        calculate_complexity_II,
+        calculate_coverage,
+        calculate_position_Youden,
         calculate_read_modification_stats,
+        clean_NaN,
+        filter_reads_on_length_quality_mapping,
         filter_reads_on_modification_thresholds,
         flag_duplicate_reads,
-        calculate_complexity_II,
-        calculate_position_Youden,
-        binarize_on_Youden,
-        binarize_adata,
+        load_sample_sheet,
     )
-    from ..plotting import plot_read_qc_histograms
+    from ..readwrite import make_dirs
+    from .helpers import write_gz_h5ad
 
     ################################### 1) Load existing ###################################
     # General config variable init - Necessary user passed inputs
-    smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
-    output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+    smf_modality = cfg.smf_modality  # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+    output_directory = Path(
+        cfg.output_directory
+    )  # Path to the output directory to make for the analysis. Necessary.
     make_dirs([output_directory])
 
     ######### Begin Preprocessing #########
@@ -222,172 +251,238 @@ def preprocess_adata_core(
 
     ## Load sample sheet metadata based on barcode mapping ##
     if getattr(cfg, "sample_sheet_path", None):
-        load_sample_sheet(adata,
-            cfg.sample_sheet_path,
-            mapping_key_column=cfg.sample_sheet_mapping_column,
-            as_category=True,
-            force_reload=cfg.force_reload_sample_sheet)
+        load_sample_sheet(
+            adata,
+            cfg.sample_sheet_path,
+            mapping_key_column=cfg.sample_sheet_mapping_column,
+            as_category=True,
+            force_reload=cfg.force_reload_sample_sheet,
+        )
     else:
         pass
-
+
     # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
     pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
 
     if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-        print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
+        logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
     else:
         make_dirs([pp_dir, pp_length_qc_dir])
-        plot_read_qc_histograms(adata,
-            pp_length_qc_dir,
-            cfg.obs_to_plot_pp_qc,
-            sample_key=cfg.sample_name_col_for_plotting,
-            rows_per_fig=cfg.rows_per_qc_histogram_grid)
+        plot_read_qc_histograms(
+            adata,
+            pp_length_qc_dir,
+            cfg.obs_to_plot_pp_qc,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
 
     # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
     print(adata.shape)
-    adata = filter_reads_on_length_quality_mapping(adata,
-        filter_on_coordinates=cfg.read_coord_filter,
-        read_length=cfg.read_len_filter_thresholds,
-        length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
-        read_quality=cfg.read_quality_filter_thresholds,
-        mapping_quality=cfg.read_mapping_quality_filter_thresholds,
-        bypass=None,
-        force_redo=None)
+    adata = filter_reads_on_length_quality_mapping(
+        adata,
+        filter_on_coordinates=cfg.read_coord_filter,
+        read_length=cfg.read_len_filter_thresholds,
+        length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
+        read_quality=cfg.read_quality_filter_thresholds,
+        mapping_quality=cfg.read_mapping_quality_filter_thresholds,
+        bypass=None,
+        force_redo=None,
+    )
     print(adata.shape)
 
     pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
 
     if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-        print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
+        logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
     else:
         make_dirs([pp_dir, pp_length_qc_dir])
-        plot_read_qc_histograms(adata,
-            pp_length_qc_dir,
-            cfg.obs_to_plot_pp_qc,
-            sample_key=cfg.sample_name_col_for_plotting,
-            rows_per_fig=cfg.rows_per_qc_histogram_grid)
-
+        plot_read_qc_histograms(
+            adata,
+            pp_length_qc_dir,
+            cfg.obs_to_plot_pp_qc,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
+
     ############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
-    if smf_modality == 'direct':
+    if smf_modality == "direct":
         native = True
         if cfg.fit_position_methylation_thresholds:
             pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
             make_dirs([pp_Youden_dir])
             # Calculate positional methylation thresholds for mod calls
-            calculate_position_Youden(adata,
-                positive_control_sample=cfg.positive_control_sample_methylation_fitting,
-                negative_control_sample=cfg.negative_control_sample_methylation_fitting,
-                J_threshold=cfg.fit_j_threshold,
-                ref_column=cfg.reference_column,
-                sample_column=cfg.sample_column,
-                infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
-                inference_variable=cfg.inference_variable_sample_methylation_fitting,
-                save=True,
-                output_directory=pp_Youden_dir
-                )
+            calculate_position_Youden(
+                adata,
+                positive_control_sample=cfg.positive_control_sample_methylation_fitting,
+                negative_control_sample=cfg.negative_control_sample_methylation_fitting,
+                J_threshold=cfg.fit_j_threshold,
+                ref_column=cfg.reference_column,
+                sample_column=cfg.sample_column,
+                infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
+                inference_variable=cfg.inference_variable_sample_methylation_fitting,
+                save=True,
+                output_directory=pp_Youden_dir,
+            )
             # binarize the modcalls based on the determined thresholds
-            binarize_on_Youden(adata,
-                ref_column=cfg.reference_column,
-                output_layer_name=cfg.output_binary_layer_name
-                )
+            binarize_on_Youden(
+                adata,
+                ref_column=cfg.reference_column,
+                output_layer_name=cfg.output_binary_layer_name,
+            )
         else:
-            binarize_adata(adata,
-                source="X",
-                target_layer=cfg.output_binary_layer_name,
-                threshold=cfg.binarize_on_fixed_methlyation_threshold)
-
-        clean_NaN(adata,
-            layer=cfg.output_binary_layer_name,
-            bypass=cfg.bypass_clean_nan,
-            force_redo=cfg.force_redo_clean_nan
-            )
+            binarize_adata(
+                adata,
+                source="X",
+                target_layer=cfg.output_binary_layer_name,
+                threshold=cfg.binarize_on_fixed_methlyation_threshold,
+            )
+
+        clean_NaN(
+            adata,
+            layer=cfg.output_binary_layer_name,
+            bypass=cfg.bypass_clean_nan,
+            force_redo=cfg.force_redo_clean_nan,
+        )
     else:
         native = False
-        clean_NaN(adata,
-            bypass=cfg.bypass_clean_nan,
-            force_redo=cfg.force_redo_clean_nan
-            )
-
+        clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
+
     ############### Calculate positional coverage by reference set in dataset ###############
-    calculate_coverage(adata,
-        ref_column=cfg.reference_column,
-        position_nan_threshold=cfg.position_max_nan_threshold)
+    calculate_coverage(
+        adata,
+        ref_column=cfg.reference_column,
+        position_nan_threshold=cfg.position_max_nan_threshold,
+        smf_modality=smf_modality,
+        target_layer=cfg.output_binary_layer_name,
+    )
 
     ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
     # Additionally, store base_context level binary modification arrays in adata.obsm
-    append_base_context(adata,
-        ref_column=cfg.reference_column,
-        use_consensus=False,
-        native=native,
-        mod_target_bases=cfg.mod_target_bases,
-        bypass=cfg.bypass_append_base_context,
-        force_redo=cfg.force_redo_append_base_context)
-
-    adata = append_binary_layer_by_base_context(adata,
-        cfg.reference_column,
-        smf_modality,
-        bypass=cfg.bypass_append_binary_layer_by_base_context,
-        force_redo=cfg.force_redo_append_binary_layer_by_base_context)
-
-    ############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
-    calculate_read_modification_stats(adata,
-        cfg.reference_column,
-        cfg.sample_column,
-        cfg.mod_target_bases,
-        bypass=cfg.bypass_calculate_read_modification_stats,
-        force_redo=cfg.force_redo_calculate_read_modification_stats)
-
+    append_base_context(
+        adata,
+        ref_column=cfg.reference_column,
+        use_consensus=False,
+        native=native,
+        mod_target_bases=cfg.mod_target_bases,
+        bypass=cfg.bypass_append_base_context,
+        force_redo=cfg.force_redo_append_base_context,
+    )
+
+    ############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
+    calculate_read_modification_stats(
+        adata,
+        cfg.reference_column,
+        cfg.sample_column,
+        cfg.mod_target_bases,
+        bypass=cfg.bypass_calculate_read_modification_stats,
+        force_redo=cfg.force_redo_calculate_read_modification_stats,
+    )
+
     ### Make a dir for outputting sample level read modification metrics before filtering ###
     pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
 
     if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-        print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
+        logger.debug(
+            f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
+        )
     else:
         make_dirs([pp_dir, pp_meth_qc_dir])
-        obs_to_plot = ['Raw_modification_signal']
-        if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
-            obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
-        if 'A' in cfg.mod_target_bases:
-            obs_to_plot += ['Fraction_A_site_modified']
-        plot_read_qc_histograms(adata,
-            pp_meth_qc_dir, obs_to_plot,
-            sample_key=cfg.sample_name_col_for_plotting,
-            rows_per_fig=cfg.rows_per_qc_histogram_grid)
+        obs_to_plot = ["Raw_modification_signal"]
+        if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
+            obs_to_plot += [
+                "Fraction_GpC_site_modified",
+                "Fraction_CpG_site_modified",
+                "Fraction_other_C_site_modified",
+                "Fraction_C_site_modified",
+            ]
+        if "A" in cfg.mod_target_bases:
+            obs_to_plot += ["Fraction_A_site_modified"]
+        plot_read_qc_histograms(
+            adata,
+            pp_meth_qc_dir,
+            obs_to_plot,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
 
     ##### Optionally filter reads on modification metrics
-    adata = filter_reads_on_modification_thresholds(adata,
-        smf_modality=smf_modality,
-        mod_target_bases=cfg.mod_target_bases,
-        gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
-        cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
-        any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
-        a_thresholds=cfg.read_mod_filtering_a_thresholds,
-        use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
-        min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
-        bypass=cfg.bypass_filter_reads_on_modification_thresholds,
-        force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)
-
+    adata = filter_reads_on_modification_thresholds(
+        adata,
+        smf_modality=smf_modality,
+        mod_target_bases=cfg.mod_target_bases,
+        gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
+        cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
+        any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
+        a_thresholds=cfg.read_mod_filtering_a_thresholds,
+        use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
+        min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
+        bypass=cfg.bypass_filter_reads_on_modification_thresholds,
+        force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
+    )
+
     pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
-
+
     if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
-        print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
+        logger.debug(
+            f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
+        )
     else:
         make_dirs([pp_dir, pp_meth_qc_dir])
-        obs_to_plot = ['Raw_modification_signal']
-        if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
-            obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
-        if 'A' in cfg.mod_target_bases:
-            obs_to_plot += ['Fraction_A_site_modified']
-        plot_read_qc_histograms(adata,
-            pp_meth_qc_dir, obs_to_plot,
-            sample_key=cfg.sample_name_col_for_plotting,
-            rows_per_fig=cfg.rows_per_qc_histogram_grid)
+        obs_to_plot = ["Raw_modification_signal"]
+        if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
+            obs_to_plot += [
+                "Fraction_GpC_site_modified",
+                "Fraction_CpG_site_modified",
+                "Fraction_other_C_site_modified",
+                "Fraction_C_site_modified",
+            ]
+        if "A" in cfg.mod_target_bases:
+            obs_to_plot += ["Fraction_A_site_modified"]
+        plot_read_qc_histograms(
+            adata,
+            pp_meth_qc_dir,
+            obs_to_plot,
+            sample_key=cfg.sample_name_col_for_plotting,
+            rows_per_fig=cfg.rows_per_qc_histogram_grid,
+        )
+
+    ############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
+    calculate_coverage(
+        adata,
+        ref_column=cfg.reference_column,
+        position_nan_threshold=cfg.position_max_nan_threshold,
+        smf_modality=smf_modality,
+        target_layer=cfg.output_binary_layer_name,
+        force_redo=True,
+    )
+
+    ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
+    # Additionally, store base_context level binary modification arrays in adata.obsm
+    append_base_context(
+        adata,
+        ref_column=cfg.reference_column,
+        use_consensus=False,
+        native=native,
+        mod_target_bases=cfg.mod_target_bases,
+        bypass=cfg.bypass_append_base_context,
+        force_redo=True,
+    )
+
+    # Add site type binary modification layers for valid coverage sites
+    adata = append_binary_layer_by_base_context(
+        adata,
+        cfg.reference_column,
+        smf_modality,
+        bypass=cfg.bypass_append_binary_layer_by_base_context,
+        force_redo=cfg.force_redo_append_binary_layer_by_base_context,
+        from_valid_sites_only=True,
+    )
 
     ############### Duplicate detection for conversion/deamination SMF ###############
-    if smf_modality != 'direct':
+    if smf_modality != "direct":
         references = adata.obs[cfg.reference_column].cat.categories
 
-        var_filters_sets =[]
+        var_filters_sets = []
         for ref in references:
             for site_type in cfg.duplicate_detection_site_types:
                 var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
@@ -397,27 +492,30 @@ def preprocess_adata_core(
         make_dirs([pp_dup_qc_dir])
 
         # Flag duplicate reads and plot duplicate detection QC
-        adata_unique, adata = flag_duplicate_reads(adata,
-            var_filters_sets,
-            distance_threshold=cfg.duplicate_detection_distance_threshold,
-            obs_reference_col=cfg.reference_column,
-            sample_col=cfg.sample_name_col_for_plotting,
-            output_directory=pp_dup_qc_dir,
-            metric_keys=cfg.hamming_vs_metric_keys,
-            keep_best_metric=cfg.duplicate_detection_keep_best_metric,
-            bypass=cfg.bypass_flag_duplicate_reads,
-            force_redo=cfg.force_redo_flag_duplicate_reads,
-            window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
-            min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
-            do_pca=cfg.duplicate_detection_do_pca,
-            pca_n_components=50,
-            pca_center=True,
-            do_hierarchical=cfg.duplicate_detection_do_hierarchical,
-            hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
-            hierarchical_metric="euclidean",
-            hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors
-            )
-
+        adata_unique, adata = flag_duplicate_reads(
+            adata,
+            var_filters_sets,
+            distance_threshold=cfg.duplicate_detection_distance_threshold,
+            obs_reference_col=cfg.reference_column,
+            sample_col=cfg.sample_name_col_for_plotting,
+            output_directory=pp_dup_qc_dir,
+            metric_keys=cfg.hamming_vs_metric_keys,
+            keep_best_metric=cfg.duplicate_detection_keep_best_metric,
+            bypass=cfg.bypass_flag_duplicate_reads,
+            force_redo=cfg.force_redo_flag_duplicate_reads,
+            window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
+            min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
+            do_pca=cfg.duplicate_detection_do_pca,
+            pca_n_components=50,
+            pca_center=True,
+            do_hierarchical=cfg.duplicate_detection_do_hierarchical,
+            hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
+            hierarchical_metric="euclidean",
+            hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
+            demux_types=("double", "already"),
+            demux_col="demux_type",
+        )
+
         # Use the flagged duplicate read groups and perform complexity analysis
         complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
         make_dirs([complexity_outs])
@@ -426,15 +524,15 @@ def preprocess_adata_core(
             output_directory=complexity_outs,
             sample_col=cfg.sample_name_col_for_plotting,
             ref_col=cfg.reference_column,
-            cluster_col='sequence__merged_cluster_id',
+            cluster_col="sequence__merged_cluster_id",
             plot=True,
-            save_plot=True, # set False to display instead
+            save_plot=True,  # set False to display instead
             n_boot=30,
             n_depths=12,
             random_state=42,
             csv_summary=True,
             bypass=cfg.bypass_complexity_analysis,
-            force_redo=cfg.force_redo_complexity_analysis
+            force_redo=cfg.force_redo_complexity_analysis,
         )
 
     else:
@@ -443,13 +541,29 @@ def preprocess_adata_core(
 
     ############################################### Save preprocessed adata with duplicate detection ###############################################
    if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
-        print('Saving preprocessed adata.')
+        logger.info("Saving preprocessed adata.")
+        record_smftools_metadata(
+            adata,
+            step_name="preprocess",
+            cfg=cfg,
+            config_path=config_path,
+            input_paths=[source_adata_path] if source_adata_path else None,
+            output_path=pp_adata_path,
+        )
         write_gz_h5ad(adata, pp_adata_path)
 
     if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
-        print('Saving preprocessed adata with duplicates removed.')
-        write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
+        logger.info("Saving preprocessed adata with duplicates removed.")
+        record_smftools_metadata(
+            adata_unique,
+            step_name="preprocess",
+            cfg=cfg,
+            config_path=config_path,
+            input_paths=[pp_adata_path],
+            output_path=pp_dup_rem_adata_path,
+        )
+        write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
 
     ########################################################################################################################
 
-    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
+    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
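For orientation, a minimal usage sketch of the preprocess_adata entry point whose updated signature appears above. The config path is a placeholder, and the import path assumes these hunks belong to smftools/cli/preprocess_adata.py, as the changed-files list indicates.

# Hypothetical invocation of the CLI-layer helper shown in this diff.
# "experiment_config.yaml" is a placeholder, not a file shipped with smftools.
from smftools.cli.preprocess_adata import preprocess_adata

pp_adata, pp_path, pp_dedup_adata, pp_dedup_path = preprocess_adata("experiment_config.yaml")

if pp_adata is None:
    # Per the hunks above, (None, None, None, None) is returned when no AnnData stage
    # exists or a later-stage AnnData (spatial/HMM) supersedes preprocessing, and
    # (None, pp_path, None, pp_dedup_path) when a deduplicated AnnData already exists.
    print("Preprocessing was skipped or could not run.")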