smftools-0.2.3-py3-none-any.whl → smftools-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137):
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,293 +1,488 @@
1
- def preprocess_adata(config_path):
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
3
+
4
+ import anndata as ad
5
+
6
+ from smftools.logging_utils import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ def preprocess_adata(
12
+ config_path: str,
13
+ ) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
2
14
  """
3
- High-level function to call for preprocessing an adata object.
4
- Command line accesses this through smftools preprocess <config_path>
15
+ CLI-facing wrapper for preprocessing.
5
16
 
6
- Parameters:
7
- config_path (str): A string representing the file path to the experiment configuration csv file.
17
+ Called by: `smftools preprocess <config_path>`
8
18
 
9
- Returns:
10
- (pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path)
19
+ - Ensure a raw AnnData exists (or some later-stage AnnData) via `load_adata`.
20
+ - Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
21
+ - Respect cfg flags (force_redo_preprocessing, force_redo_flag_duplicate_reads).
22
+ - Decide what starting AnnData to load (or whether to early-return).
23
+ - Call `preprocess_adata_core(...)` when appropriate.
24
+
25
+ Returns
26
+ -------
27
+ pp_adata : AnnData | None
28
+ Preprocessed AnnData (may be None if we skipped work).
29
+ pp_adata_path : Path | None
30
+ Path to preprocessed AnnData.
31
+ pp_dedup_adata : AnnData | None
32
+ Preprocessed, duplicate-removed AnnData.
33
+ pp_dedup_adata_path : Path | None
34
+ Path to preprocessed, duplicate-removed AnnData.
11
35
  """
12
- from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
36
+ from ..readwrite import safe_read_h5ad
37
+ from .helpers import get_adata_paths
13
38
  from .load_adata import load_adata
14
39
 
15
- import numpy as np
16
- import pandas as pd
17
- import anndata as ad
18
- import scanpy as sc
40
+ # 1) Ensure config is loaded and at least *some* AnnData stage exists
41
+ loaded_adata, loaded_path, cfg = load_adata(config_path)
19
42
 
20
- import os
21
- from importlib import resources
22
- from pathlib import Path
43
+ # 2) Compute canonical paths
44
+ paths = get_adata_paths(cfg)
45
+ raw_path = paths.raw
46
+ pp_path = paths.pp
47
+ pp_dedup_path = paths.pp_dedup
48
+ spatial_path = paths.spatial
49
+ hmm_path = paths.hmm
23
50
 
24
- from datetime import datetime
25
- date_str = datetime.today().strftime("%y%m%d")
51
+ raw_exists = raw_path.exists()
52
+ pp_exists = pp_path.exists()
53
+ pp_dedup_exists = pp_dedup_path.exists()
54
+ spatial_exists = spatial_path.exists()
55
+ hmm_exists = hmm_path.exists()
26
56
 
27
- ################################### 1) Load existing ###################################
28
- adata, adata_path, cfg = load_adata(config_path)
57
+ # Helper: reuse loaded_adata if it matches the path we want, else read from disk
58
+ def _load(path: Path):
59
+ if loaded_adata is not None and loaded_path == path:
60
+ return loaded_adata
61
+ adata, _ = safe_read_h5ad(path)
62
+ return adata
29
63
 
30
- # General config variable init - Necessary user passed inputs
31
- smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
32
- output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
64
+ # -----------------------------
65
+ # Case A: full redo of preprocessing
66
+ # -----------------------------
67
+ if getattr(cfg, "force_redo_preprocessing", False):
68
+ logger.info(
69
+ "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
70
+ )
33
71
 
34
- # Make initial output directory
35
- make_dirs([output_directory])
72
+ if hmm_exists:
73
+ adata = _load(hmm_path)
74
+ source_path = hmm_path
75
+ elif spatial_exists:
76
+ adata = _load(spatial_path)
77
+ source_path = spatial_path
78
+ elif pp_dedup_exists:
79
+ adata = _load(pp_dedup_path)
80
+ source_path = pp_dedup_path
81
+ elif pp_exists:
82
+ adata = _load(pp_path)
83
+ source_path = pp_path
84
+ elif raw_exists:
85
+ adata = _load(raw_path)
86
+ source_path = raw_path
87
+ else:
88
+ logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
89
+ return (None, None, None, None)
36
90
 
37
- input_manager_df = pd.read_csv(cfg.summary_file)
38
- initial_adata_path = Path(input_manager_df['load_adata'][0])
39
- pp_adata_path = Path(input_manager_df['pp_adata'][0])
40
- pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
41
- spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
42
- hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
91
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
92
+ adata=adata,
93
+ cfg=cfg,
94
+ pp_adata_path=pp_path,
95
+ pp_dup_rem_adata_path=pp_dedup_path,
96
+ source_adata_path=source_path,
97
+ config_path=config_path,
98
+ )
99
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
43
100
 
44
- if adata:
45
- # This happens on first run of the load pipeline
46
- pass
47
- else:
48
- # If an anndata is saved, check which stages of the anndata are available
49
- initial_version_available = initial_adata_path.exists()
50
- preprocessed_version_available = pp_adata_path.exists()
51
- preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
52
- spatial_adata_exists = spatial_adata_path.exists()
53
- hmm_adata_exists = hmm_adata_path.exists()
54
-
55
- if cfg.force_redo_preprocessing:
56
- print(f"Forcing full redo of preprocessing workflow, starting from earliest stage adata available.")
57
- if initial_version_available:
58
- adata, load_report = safe_read_h5ad(initial_adata_path)
59
- elif preprocessed_version_available:
60
- adata, load_report = safe_read_h5ad(pp_adata_path)
61
- elif preprocessed_dup_removed_version_available:
62
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
63
- else:
64
- print(f"Can not redo preprocessing when there is no adata available.")
65
- return
66
- elif cfg.force_redo_flag_duplicate_reads:
67
- print(f"Forcing redo of duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
68
- if preprocessed_version_available:
69
- adata, load_report = safe_read_h5ad(pp_adata_path)
70
- elif initial_version_available:
71
- adata, load_report = safe_read_h5ad(initial_adata_path)
72
- else:
73
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
74
- return
75
- elif cfg.force_redo_basic_analyses:
76
- print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
77
- if preprocessed_version_available:
78
- adata, load_report = safe_read_h5ad(pp_adata_path)
79
- elif initial_version_available:
80
- adata, load_report = safe_read_h5ad(initial_adata_path)
81
- else:
82
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
83
- elif hmm_adata_exists:
84
- print(f"HMM anndata found: {hmm_adata_path}")
85
- return (None, None, None, None)
86
- elif spatial_adata_exists:
87
- print(f"Spatial anndata found: {spatial_adata_exists}")
88
- return (None, None, None, None)
89
- elif preprocessed_dup_removed_version_available:
90
- print(f"Preprocessed deduplicated anndata found: {pp_dup_rem_adata_path}")
91
- return (None, pp_adata_path, None, pp_dup_rem_adata_path)
92
- elif preprocessed_version_available:
93
- print(f"Preprocessed anndata found: {pp_adata_path}")
94
- adata, load_report = safe_read_h5ad(pp_adata_path)
95
- elif initial_version_available:
96
- adata, load_report = safe_read_h5ad(initial_adata_path)
101
+ # -----------------------------
102
+ # Case B: redo duplicate detection only
103
+ # -----------------------------
104
+ if getattr(cfg, "force_redo_flag_duplicate_reads", False):
105
+ logger.info(
106
+ "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
107
+ "if available. Otherwise, will use the raw AnnData."
108
+ )
109
+ if pp_exists:
110
+ adata = _load(pp_path)
111
+ source_path = pp_path
112
+ elif raw_exists:
113
+ adata = _load(raw_path)
114
+ source_path = raw_path
97
115
  else:
98
- print(f"No adata available.")
99
- return
100
-
116
+ logger.error(
117
+ "Cannot redo duplicate detection: no compatible AnnData available "
118
+ "(need at least raw or preprocessed)."
119
+ )
120
+ return (None, None, None, None)
121
+
122
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
123
+ adata=adata,
124
+ cfg=cfg,
125
+ pp_adata_path=pp_path,
126
+ pp_dup_rem_adata_path=pp_dedup_path,
127
+ source_adata_path=source_path,
128
+ config_path=config_path,
129
+ )
130
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
131
+
132
+ # -----------------------------
133
+ # Case C: normal behavior (no explicit redo flags)
134
+ # -----------------------------
135
+
136
+ # If HMM exists, preprocessing is considered “done enough”
137
+ if hmm_exists:
138
+ logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
139
+ return (None, None, None, None)
140
+
141
+ # If spatial exists, also skip re-preprocessing by default
142
+ if spatial_exists:
143
+ logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
144
+ return (None, None, None, None)
145
+
146
+ # If pp_dedup exists, just return paths (no recomputation)
147
+ if pp_dedup_exists:
148
+ logger.debug(
149
+ f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
150
+ )
151
+ return (None, pp_path, None, pp_dedup_path)
152
+
153
+ # If pp exists but pp_dedup does not, load pp and run core
154
+ if pp_exists:
155
+ logger.debug(f"Preprocessed AnnData found: {pp_path}")
156
+ adata = _load(pp_path)
157
+ source_path = pp_path
158
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
159
+ adata=adata,
160
+ cfg=cfg,
161
+ pp_adata_path=pp_path,
162
+ pp_dup_rem_adata_path=pp_dedup_path,
163
+ source_adata_path=source_path,
164
+ config_path=config_path,
165
+ )
166
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
167
+
168
+ # Otherwise, fall back to raw (if available)
169
+ if raw_exists:
170
+ adata = _load(raw_path)
171
+ source_path = raw_path
172
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
173
+ adata=adata,
174
+ cfg=cfg,
175
+ pp_adata_path=pp_path,
176
+ pp_dup_rem_adata_path=pp_dedup_path,
177
+ source_adata_path=source_path,
178
+ config_path=config_path,
179
+ )
180
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
181
+
182
+ logger.error("No AnnData available at any stage for preprocessing.")
183
+ return (None, None, None, None)
184
+
185
+
186
+ def preprocess_adata_core(
187
+ adata: ad.AnnData,
188
+ cfg,
189
+ pp_adata_path: Path,
190
+ pp_dup_rem_adata_path: Path,
191
+ source_adata_path: Optional[Path] = None,
192
+ config_path: Optional[str] = None,
193
+ ) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
194
+ """
195
+ Core preprocessing pipeline.
196
+
197
+ Assumes:
198
+ - `adata` is an AnnData object at some stage (raw/pp/etc.) to start preprocessing from.
199
+ - `cfg` is the ExperimentConfig containing all thresholds & options.
200
+ - `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for
201
+ preprocessed and preprocessed+deduplicated AnnData.
202
+
203
+ Does NOT:
204
+ - Decide which stage to load from (that's the wrapper's job).
205
+ - Decide whether to skip entirely; it always runs its steps, but individual
206
+ sub-steps may skip based on `cfg.bypass_*` or directory existence.
207
+
208
+ Returns
209
+ -------
210
+ pp_adata : AnnData
211
+ Preprocessed AnnData (with QC filters, binarization, etc.).
212
+ pp_adata_path : Path
213
+ Path where pp_adata was written.
214
+ pp_dedup_adata : AnnData
215
+ Preprocessed AnnData with duplicate reads removed (for non-direct SMF).
216
+ pp_dup_rem_adata_path : Path
217
+ Path where pp_dedup_adata was written.
218
+ """
219
+ from pathlib import Path
220
+
221
+ from ..metadata import record_smftools_metadata
222
+ from ..plotting import plot_read_qc_histograms
223
+ from ..preprocessing import (
224
+ append_base_context,
225
+ append_binary_layer_by_base_context,
226
+ binarize_adata,
227
+ binarize_on_Youden,
228
+ calculate_complexity_II,
229
+ calculate_coverage,
230
+ calculate_position_Youden,
231
+ calculate_read_modification_stats,
232
+ clean_NaN,
233
+ filter_reads_on_length_quality_mapping,
234
+ filter_reads_on_modification_thresholds,
235
+ flag_duplicate_reads,
236
+ load_sample_sheet,
237
+ )
238
+ from ..readwrite import make_dirs
239
+ from .helpers import write_gz_h5ad
240
+
241
+ ################################### 1) Load existing ###################################
242
+ # General config variable init - Necessary user passed inputs
243
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
244
+ output_directory = Path(
245
+ cfg.output_directory
246
+ ) # Path to the output directory to make for the analysis. Necessary.
247
+ make_dirs([output_directory])
248
+
101
249
  ######### Begin Preprocessing #########
102
250
  pp_dir = output_directory / "preprocessed"
103
251
 
104
252
  ## Load sample sheet metadata based on barcode mapping ##
105
- if cfg.sample_sheet_path:
106
- from ..preprocessing import load_sample_sheet
107
- load_sample_sheet(adata,
108
- cfg.sample_sheet_path,
109
- mapping_key_column=cfg.sample_sheet_mapping_column,
110
- as_category=True,
111
- force_reload=cfg.force_reload_sample_sheet)
253
+ if getattr(cfg, "sample_sheet_path", None):
254
+ load_sample_sheet(
255
+ adata,
256
+ cfg.sample_sheet_path,
257
+ mapping_key_column=cfg.sample_sheet_mapping_column,
258
+ as_category=True,
259
+ force_reload=cfg.force_reload_sample_sheet,
260
+ )
112
261
  else:
113
262
  pass
114
-
263
+
115
264
  # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
116
265
  pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
117
266
 
118
267
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
119
- print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
268
+ logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
120
269
  else:
121
- from ..plotting import plot_read_qc_histograms
122
270
  make_dirs([pp_dir, pp_length_qc_dir])
123
- obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
124
- plot_read_qc_histograms(adata,
125
- pp_length_qc_dir,
126
- obs_to_plot,
127
- sample_key=cfg.sample_name_col_for_plotting,
128
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
271
+ plot_read_qc_histograms(
272
+ adata,
273
+ pp_length_qc_dir,
274
+ cfg.obs_to_plot_pp_qc,
275
+ sample_key=cfg.sample_name_col_for_plotting,
276
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
277
+ )
129
278
 
130
279
  # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
131
- from ..preprocessing import filter_reads_on_length_quality_mapping
132
280
  print(adata.shape)
133
- adata = filter_reads_on_length_quality_mapping(adata,
134
- filter_on_coordinates=cfg.read_coord_filter,
135
- read_length=cfg.read_len_filter_thresholds,
136
- length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
137
- read_quality=cfg.read_quality_filter_thresholds,
138
- mapping_quality=cfg.read_mapping_quality_filter_thresholds,
139
- bypass=None,
140
- force_redo=None)
281
+ adata = filter_reads_on_length_quality_mapping(
282
+ adata,
283
+ filter_on_coordinates=cfg.read_coord_filter,
284
+ read_length=cfg.read_len_filter_thresholds,
285
+ length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
286
+ read_quality=cfg.read_quality_filter_thresholds,
287
+ mapping_quality=cfg.read_mapping_quality_filter_thresholds,
288
+ bypass=None,
289
+ force_redo=None,
290
+ )
141
291
  print(adata.shape)
142
292
 
143
293
  pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
144
294
 
145
295
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
146
- print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
296
+ logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
147
297
  else:
148
- from ..plotting import plot_read_qc_histograms
149
298
  make_dirs([pp_dir, pp_length_qc_dir])
150
- obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
151
- plot_read_qc_histograms(adata,
152
- pp_length_qc_dir,
153
- obs_to_plot,
154
- sample_key=cfg.sample_name_col_for_plotting,
155
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
156
-
299
+ plot_read_qc_histograms(
300
+ adata,
301
+ pp_length_qc_dir,
302
+ cfg.obs_to_plot_pp_qc,
303
+ sample_key=cfg.sample_name_col_for_plotting,
304
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
305
+ )
306
+
157
307
  ############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
158
- from ..preprocessing import clean_NaN
159
- if smf_modality == 'direct':
160
- from ..preprocessing import calculate_position_Youden, binarize_on_Youden, binarize_adata
308
+ if smf_modality == "direct":
161
309
  native = True
162
310
  if cfg.fit_position_methylation_thresholds:
163
311
  pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
164
312
  make_dirs([pp_Youden_dir])
165
313
  # Calculate positional methylation thresholds for mod calls
166
- calculate_position_Youden(adata,
167
- positive_control_sample=cfg.positive_control_sample_methylation_fitting,
168
- negative_control_sample=cfg.negative_control_sample_methylation_fitting,
169
- J_threshold=cfg.fit_j_threshold,
170
- obs_column=cfg.reference_column,
171
- infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
172
- inference_variable=cfg.inference_variable_sample_methylation_fitting,
173
- save=True,
174
- output_directory=pp_Youden_dir
175
- )
314
+ calculate_position_Youden(
315
+ adata,
316
+ positive_control_sample=cfg.positive_control_sample_methylation_fitting,
317
+ negative_control_sample=cfg.negative_control_sample_methylation_fitting,
318
+ J_threshold=cfg.fit_j_threshold,
319
+ ref_column=cfg.reference_column,
320
+ sample_column=cfg.sample_column,
321
+ infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
322
+ inference_variable=cfg.inference_variable_sample_methylation_fitting,
323
+ save=True,
324
+ output_directory=pp_Youden_dir,
325
+ )
176
326
  # binarize the modcalls based on the determined thresholds
177
- binarize_on_Youden(adata,
178
- obs_column=cfg.reference_column,
179
- output_layer_name=cfg.output_binary_layer_name
180
- )
327
+ binarize_on_Youden(
328
+ adata,
329
+ ref_column=cfg.reference_column,
330
+ output_layer_name=cfg.output_binary_layer_name,
331
+ )
181
332
  else:
182
- binarize_adata(adata,
183
- source="X",
184
- target_layer=cfg.output_binary_layer_name,
185
- threshold=cfg.binarize_on_fixed_methlyation_threshold)
186
-
187
- clean_NaN(adata,
188
- layer=cfg.output_binary_layer_name,
189
- bypass=cfg.bypass_clean_nan,
190
- force_redo=cfg.force_redo_clean_nan
191
- )
333
+ binarize_adata(
334
+ adata,
335
+ source="X",
336
+ target_layer=cfg.output_binary_layer_name,
337
+ threshold=cfg.binarize_on_fixed_methlyation_threshold,
338
+ )
339
+
340
+ clean_NaN(
341
+ adata,
342
+ layer=cfg.output_binary_layer_name,
343
+ bypass=cfg.bypass_clean_nan,
344
+ force_redo=cfg.force_redo_clean_nan,
345
+ )
192
346
  else:
193
347
  native = False
194
- clean_NaN(adata,
195
- bypass=cfg.bypass_clean_nan,
196
- force_redo=cfg.force_redo_clean_nan
197
- )
348
+ clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
349
+
350
+ ############### Calculate positional coverage by reference set in dataset ###############
351
+ calculate_coverage(
352
+ adata,
353
+ ref_column=cfg.reference_column,
354
+ position_nan_threshold=cfg.position_max_nan_threshold,
355
+ smf_modality=smf_modality,
356
+ target_layer=cfg.output_binary_layer_name,
357
+ )
198
358
 
199
359
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
200
- from ..preprocessing import append_base_context, append_binary_layer_by_base_context
201
360
  # Additionally, store base_context level binary modification arrays in adata.obsm
202
- append_base_context(adata,
203
- obs_column=cfg.reference_column,
204
- use_consensus=False,
205
- native=native,
206
- mod_target_bases=cfg.mod_target_bases,
207
- bypass=cfg.bypass_append_base_context,
208
- force_redo=cfg.force_redo_append_base_context)
209
-
210
- adata = append_binary_layer_by_base_context(adata,
211
- cfg.reference_column,
212
- smf_modality,
213
- bypass=cfg.bypass_append_binary_layer_by_base_context,
214
- force_redo=cfg.force_redo_append_binary_layer_by_base_context)
215
-
216
- ############### Optional inversion of the adata along positions axis ###################
217
- if cfg.invert_adata:
218
- from ..preprocessing import invert_adata
219
- adata = invert_adata(adata)
220
-
221
- ############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
222
- from ..preprocessing import calculate_read_modification_stats
223
- calculate_read_modification_stats(adata,
224
- cfg.reference_column,
225
- cfg.sample_column,
226
- cfg.mod_target_bases,
227
- bypass=cfg.bypass_calculate_read_modification_stats,
228
- force_redo=cfg.force_redo_calculate_read_modification_stats)
229
-
361
+ append_base_context(
362
+ adata,
363
+ ref_column=cfg.reference_column,
364
+ use_consensus=False,
365
+ native=native,
366
+ mod_target_bases=cfg.mod_target_bases,
367
+ bypass=cfg.bypass_append_base_context,
368
+ force_redo=cfg.force_redo_append_base_context,
369
+ )
370
+
371
+ ############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
372
+ calculate_read_modification_stats(
373
+ adata,
374
+ cfg.reference_column,
375
+ cfg.sample_column,
376
+ cfg.mod_target_bases,
377
+ bypass=cfg.bypass_calculate_read_modification_stats,
378
+ force_redo=cfg.force_redo_calculate_read_modification_stats,
379
+ )
380
+
230
381
  ### Make a dir for outputting sample level read modification metrics before filtering ###
231
382
  pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
232
383
 
233
384
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
234
- print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
385
+ logger.debug(
386
+ f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
387
+ )
235
388
  else:
236
- from ..plotting import plot_read_qc_histograms
237
389
  make_dirs([pp_dir, pp_meth_qc_dir])
238
- obs_to_plot = ['Raw_modification_signal']
239
- if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
240
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
241
- if 'A' in cfg.mod_target_bases:
242
- obs_to_plot += ['Fraction_A_site_modified']
243
- plot_read_qc_histograms(adata,
244
- pp_meth_qc_dir, obs_to_plot,
245
- sample_key=cfg.sample_name_col_for_plotting,
246
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
390
+ obs_to_plot = ["Raw_modification_signal"]
391
+ if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
392
+ obs_to_plot += [
393
+ "Fraction_GpC_site_modified",
394
+ "Fraction_CpG_site_modified",
395
+ "Fraction_other_C_site_modified",
396
+ "Fraction_C_site_modified",
397
+ ]
398
+ if "A" in cfg.mod_target_bases:
399
+ obs_to_plot += ["Fraction_A_site_modified"]
400
+ plot_read_qc_histograms(
401
+ adata,
402
+ pp_meth_qc_dir,
403
+ obs_to_plot,
404
+ sample_key=cfg.sample_name_col_for_plotting,
405
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
406
+ )
247
407
 
248
408
  ##### Optionally filter reads on modification metrics
249
- from ..preprocessing import filter_reads_on_modification_thresholds
250
- adata = filter_reads_on_modification_thresholds(adata,
251
- smf_modality=smf_modality,
252
- mod_target_bases=cfg.mod_target_bases,
253
- gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
254
- cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
255
- any_c_thresholds=cfg.read_mod_filtering_any_c_thresholds,
256
- a_thresholds=cfg.read_mod_filtering_a_thresholds,
257
- use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
258
- min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
259
- bypass=cfg.bypass_filter_reads_on_modification_thresholds,
260
- force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)
261
-
409
+ adata = filter_reads_on_modification_thresholds(
410
+ adata,
411
+ smf_modality=smf_modality,
412
+ mod_target_bases=cfg.mod_target_bases,
413
+ gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
414
+ cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
415
+ any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
416
+ a_thresholds=cfg.read_mod_filtering_a_thresholds,
417
+ use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
418
+ min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
419
+ bypass=cfg.bypass_filter_reads_on_modification_thresholds,
420
+ force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
421
+ )
422
+
262
423
  pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
263
-
424
+
264
425
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
265
- print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
426
+ logger.debug(
427
+ f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
428
+ )
266
429
  else:
267
- from ..plotting import plot_read_qc_histograms
268
430
  make_dirs([pp_dir, pp_meth_qc_dir])
269
- obs_to_plot = ['Raw_modification_signal']
270
- if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
271
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
272
- if 'A' in cfg.mod_target_bases:
273
- obs_to_plot += ['Fraction_A_site_modified']
274
- plot_read_qc_histograms(adata,
275
- pp_meth_qc_dir, obs_to_plot,
276
- sample_key=cfg.sample_name_col_for_plotting,
277
- rows_per_fig=cfg.rows_per_qc_histogram_grid)
278
-
279
- ############### Calculate positional coverage in dataset ###############
280
- from ..preprocessing import calculate_coverage
281
- calculate_coverage(adata,
282
- obs_column=cfg.reference_column,
283
- position_nan_threshold=cfg.position_max_nan_threshold)
431
+ obs_to_plot = ["Raw_modification_signal"]
432
+ if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
433
+ obs_to_plot += [
434
+ "Fraction_GpC_site_modified",
435
+ "Fraction_CpG_site_modified",
436
+ "Fraction_other_C_site_modified",
437
+ "Fraction_C_site_modified",
438
+ ]
439
+ if "A" in cfg.mod_target_bases:
440
+ obs_to_plot += ["Fraction_A_site_modified"]
441
+ plot_read_qc_histograms(
442
+ adata,
443
+ pp_meth_qc_dir,
444
+ obs_to_plot,
445
+ sample_key=cfg.sample_name_col_for_plotting,
446
+ rows_per_fig=cfg.rows_per_qc_histogram_grid,
447
+ )
448
+
449
+ ############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
450
+ calculate_coverage(
451
+ adata,
452
+ ref_column=cfg.reference_column,
453
+ position_nan_threshold=cfg.position_max_nan_threshold,
454
+ smf_modality=smf_modality,
455
+ target_layer=cfg.output_binary_layer_name,
456
+ force_redo=True,
457
+ )
458
+
459
+ ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
460
+ # Additionally, store base_context level binary modification arrays in adata.obsm
461
+ append_base_context(
462
+ adata,
463
+ ref_column=cfg.reference_column,
464
+ use_consensus=False,
465
+ native=native,
466
+ mod_target_bases=cfg.mod_target_bases,
467
+ bypass=cfg.bypass_append_base_context,
468
+ force_redo=True,
469
+ )
470
+
471
+ # Add site type binary modification layers for valid coverage sites
472
+ adata = append_binary_layer_by_base_context(
473
+ adata,
474
+ cfg.reference_column,
475
+ smf_modality,
476
+ bypass=cfg.bypass_append_binary_layer_by_base_context,
477
+ force_redo=cfg.force_redo_append_binary_layer_by_base_context,
478
+ from_valid_sites_only=True,
479
+ )
284
480
 
285
481
  ############### Duplicate detection for conversion/deamination SMF ###############
286
- if smf_modality != 'direct':
287
- from ..preprocessing import flag_duplicate_reads, calculate_complexity_II
482
+ if smf_modality != "direct":
288
483
  references = adata.obs[cfg.reference_column].cat.categories
289
484
 
290
- var_filters_sets =[]
485
+ var_filters_sets = []
291
486
  for ref in references:
292
487
  for site_type in cfg.duplicate_detection_site_types:
293
488
  var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
@@ -297,27 +492,30 @@ def preprocess_adata(config_path):
297
492
  make_dirs([pp_dup_qc_dir])
298
493
 
299
494
  # Flag duplicate reads and plot duplicate detection QC
300
- adata_unique, adata = flag_duplicate_reads(adata,
301
- var_filters_sets,
302
- distance_threshold=cfg.duplicate_detection_distance_threshold,
303
- obs_reference_col=cfg.reference_column,
304
- sample_col=cfg.sample_name_col_for_plotting,
305
- output_directory=pp_dup_qc_dir,
306
- metric_keys=cfg.hamming_vs_metric_keys,
307
- keep_best_metric=cfg.duplicate_detection_keep_best_metric,
308
- bypass=cfg.bypass_flag_duplicate_reads,
309
- force_redo=cfg.force_redo_flag_duplicate_reads,
310
- window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
311
- min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
312
- do_pca=cfg.duplicate_detection_do_pca,
313
- pca_n_components=50,
314
- pca_center=True,
315
- do_hierarchical=cfg.duplicate_detection_do_hierarchical,
316
- hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
317
- hierarchical_metric="euclidean",
318
- hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors
319
- )
320
-
495
+ adata_unique, adata = flag_duplicate_reads(
496
+ adata,
497
+ var_filters_sets,
498
+ distance_threshold=cfg.duplicate_detection_distance_threshold,
499
+ obs_reference_col=cfg.reference_column,
500
+ sample_col=cfg.sample_name_col_for_plotting,
501
+ output_directory=pp_dup_qc_dir,
502
+ metric_keys=cfg.hamming_vs_metric_keys,
503
+ keep_best_metric=cfg.duplicate_detection_keep_best_metric,
504
+ bypass=cfg.bypass_flag_duplicate_reads,
505
+ force_redo=cfg.force_redo_flag_duplicate_reads,
506
+ window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
507
+ min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
508
+ do_pca=cfg.duplicate_detection_do_pca,
509
+ pca_n_components=50,
510
+ pca_center=True,
511
+ do_hierarchical=cfg.duplicate_detection_do_hierarchical,
512
+ hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
513
+ hierarchical_metric="euclidean",
514
+ hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
515
+ demux_types=("double", "already"),
516
+ demux_col="demux_type",
517
+ )
518
+
321
519
  # Use the flagged duplicate read groups and perform complexity analysis
322
520
  complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
323
521
  make_dirs([complexity_outs])
@@ -326,15 +524,15 @@ def preprocess_adata(config_path):
326
524
  output_directory=complexity_outs,
327
525
  sample_col=cfg.sample_name_col_for_plotting,
328
526
  ref_col=cfg.reference_column,
329
- cluster_col='sequence__merged_cluster_id',
527
+ cluster_col="sequence__merged_cluster_id",
330
528
  plot=True,
331
- save_plot=True, # set False to display instead
529
+ save_plot=True, # set False to display instead
332
530
  n_boot=30,
333
531
  n_depths=12,
334
532
  random_state=42,
335
533
  csv_summary=True,
336
534
  bypass=cfg.bypass_complexity_analysis,
337
- force_redo=cfg.force_redo_complexity_analysis
535
+ force_redo=cfg.force_redo_complexity_analysis,
338
536
  )
339
537
 
340
538
  else:
@@ -342,22 +540,30 @@ def preprocess_adata(config_path):
342
540
  ########################################################################################################################
343
541
 
344
542
  ############################################### Save preprocessed adata with duplicate detection ###############################################
345
- from ..readwrite import safe_write_h5ad
346
543
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
347
- print('Saving preprocessed adata.')
348
- if ".gz" == pp_adata_path.suffix:
349
- safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
350
- else:
351
- pp_adata_path = pp_adata_path.with_name(pp_adata_path.name + '.gz')
352
- safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
544
+ logger.info("Saving preprocessed adata.")
545
+ record_smftools_metadata(
546
+ adata,
547
+ step_name="preprocess",
548
+ cfg=cfg,
549
+ config_path=config_path,
550
+ input_paths=[source_adata_path] if source_adata_path else None,
551
+ output_path=pp_adata_path,
552
+ )
553
+ write_gz_h5ad(adata, pp_adata_path)
353
554
 
354
555
  if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
355
- print('Saving preprocessed adata with duplicates removed.')
356
- if ".gz" == pp_dup_rem_adata_path.suffix:
357
- safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
358
- else:
359
- pp_adata_path = pp_dup_rem_adata_path.with_name(pp_dup_rem_adata_path.name + '.gz')
360
- safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
556
+ logger.info("Saving preprocessed adata with duplicates removed.")
557
+ record_smftools_metadata(
558
+ adata_unique,
559
+ step_name="preprocess",
560
+ cfg=cfg,
561
+ config_path=config_path,
562
+ input_paths=[pp_adata_path],
563
+ output_path=pp_dup_rem_adata_path,
564
+ )
565
+ write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
566
+
361
567
  ########################################################################################################################
362
568
 
363
- return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
569
+ return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)