smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114):
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,455 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
3
+
4
+ import anndata as ad
5
+
6
def preprocess_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
    """
    Command-line entry point for the preprocessing stage.

    Invoked as: `smftools preprocess <config_path>`

    Workflow:
      1. Run `load_adata` so the config is parsed and some AnnData stage exists.
      2. Check which stage files (raw, pp, pp_dedup, spatial, hmm) are on disk.
      3. Honour `cfg.force_redo_preprocessing` and
         `cfg.force_redo_flag_duplicate_reads` (both default to False if absent).
      4. Pick the AnnData to start from, or return early when nothing needs doing.
      5. Hand the chosen AnnData to `preprocess_adata_core(...)`.

    Returns
    -------
    pp_adata : AnnData | None
        Preprocessed AnnData; None when preprocessing was skipped.
    pp_adata_path : Path | None
        Location of the preprocessed AnnData.
    pp_dedup_adata : AnnData | None
        Preprocessed AnnData with duplicates removed; None when skipped.
    pp_dedup_adata_path : Path | None
        Location of the preprocessed, duplicate-removed AnnData.
    """
    from ..readwrite import safe_read_h5ad
    from .load_adata import load_adata
    from .helpers import get_adata_paths

    # Step 1: parse the config and make sure at least one AnnData stage exists.
    loaded_adata, loaded_path, cfg = load_adata(config_path)

    # Step 2: canonical on-disk location of every stage, plus whether it is present.
    paths = get_adata_paths(cfg)
    stage_path = {
        "raw": paths.raw,
        "pp": paths.pp,
        "pp_dedup": paths.pp_dedup,
        "spatial": paths.spatial,
        "hmm": paths.hmm,
    }
    on_disk = {stage: location.exists() for stage, location in stage_path.items()}
    pp_path = stage_path["pp"]
    pp_dedup_path = stage_path["pp_dedup"]
    skipped = (None, None, None, None)

    def _read_stage(stage: str):
        """Return the AnnData for `stage`, reusing the already-loaded object when it is the same file."""
        target = stage_path[stage]
        if loaded_adata is None or loaded_path != target:
            from_disk, _ = safe_read_h5ad(target)
            return from_disk
        return loaded_adata

    def _run_core_from(stage: str):
        """Run the core pipeline starting from `stage` and pass its four outputs through."""
        pp, pp_out, dedup, dedup_out = preprocess_adata_core(
            adata=_read_stage(stage),
            cfg=cfg,
            pp_adata_path=pp_path,
            pp_dup_rem_adata_path=pp_dedup_path,
        )
        return pp, pp_out, dedup, dedup_out

    def _first_available(preference):
        """Return the first stage name in `preference` whose file exists, else None."""
        return next((stage for stage in preference if on_disk[stage]), None)

    # ---- Case A: the whole preprocessing workflow is being forced ----
    if getattr(cfg, "force_redo_preprocessing", False):
        print("Forcing full redo of preprocessing workflow, starting from latest stage AnnData available.")
        start = _first_available(("hmm", "spatial", "pp_dedup", "pp", "raw"))
        if start is None:
            print("Cannot redo preprocessing: no AnnData available at any stage.")
            return skipped
        return _run_core_from(start)

    # ---- Case B: only duplicate detection is being forced ----
    if getattr(cfg, "force_redo_flag_duplicate_reads", False):
        print(
            "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
            "if available. Otherwise, will use the raw AnnData."
        )
        start = _first_available(("pp", "raw"))
        if start is None:
            print(
                "Cannot redo duplicate detection: no compatible AnnData available "
                "(need at least raw or preprocessed)."
            )
            return skipped
        return _run_core_from(start)

    # ---- Case C: default behaviour, no redo flags ----

    # A downstream HMM or spatial file means preprocessing has already happened.
    if on_disk["hmm"]:
        hmm_path = stage_path["hmm"]
        print(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
        return skipped

    if on_disk["spatial"]:
        spatial_path = stage_path["spatial"]
        print(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
        return skipped

    # The deduplicated output already exists: report the paths, load nothing.
    if on_disk["pp_dedup"]:
        print(f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}")
        return (None, pp_path, None, pp_dedup_path)

    # Preprocessed file present but no deduplicated one: continue from pp.
    if on_disk["pp"]:
        print(f"Preprocessed AnnData found: {pp_path}")
        return _run_core_from("pp")

    # Last resort: start from the raw AnnData.
    if on_disk["raw"]:
        return _run_core_from("raw")

    print("No AnnData available at any stage for preprocessing.")
    return skipped
158
+
159
+
160
def preprocess_adata_core(
    adata: ad.AnnData,
    cfg,
    pp_adata_path: Path,
    pp_dup_rem_adata_path: Path,
) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
    """
    Core preprocessing pipeline.

    Assumes:
      - `adata` is an AnnData object at some stage (raw/pp/etc.) to start preprocessing from.
      - `cfg` is the experiment configuration object holding all thresholds and options
        read below (`cfg.smf_modality`, `cfg.output_directory`, `cfg.reference_column`, ...).
      - `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for the
        preprocessed and preprocessed+deduplicated AnnData.

    Does NOT:
      - Decide which stage to load from (that is the wrapper's job).
      - Decide whether to skip entirely; it always runs its steps, but individual
        sub-steps may skip based on `cfg.bypass_*` or on a QC directory already existing.

    Steps, in order:
      1. Optionally attach sample sheet metadata.
      2. Plot read-level QC histograms, filter reads on length/quality/mapping, plot again.
      3. For `smf_modality == 'direct'`: binarize (Youden-fitted or fixed threshold), then
         clean NaNs on the binary layer. Otherwise: clean NaNs only.
      4. Compute coverage, append base context and per-context binary layers, and compute
         read-level modification statistics.
      5. Plot modification QC, filter reads on modification thresholds, plot again.
      6. For non-direct modalities: flag duplicate reads and run the complexity analysis.
      7. Write both AnnData objects.

    Outputs are written when the target file is missing, when
    `cfg.force_redo_preprocessing` is set, or when duplicate detection was forced via
    `cfg.force_redo_flag_duplicate_reads` on a non-direct modality. The last condition
    keeps the files on disk in sync with a forced duplicate-detection rerun.

    Returns
    -------
    pp_adata : AnnData
        Preprocessed AnnData (with QC filters, binarization, etc.).
    pp_adata_path : Path
        Target path of pp_adata (unchanged from the argument).
    pp_dedup_adata : AnnData
        First object returned by `flag_duplicate_reads` for non-direct SMF; for direct
        SMF this is the same object as pp_adata.
    pp_dup_rem_adata_path : Path
        Target path of pp_dedup_adata (unchanged from the argument).
    """
    from .helpers import write_gz_h5ad
    from ..readwrite import make_dirs
    from ..preprocessing import (
        load_sample_sheet,
        filter_reads_on_length_quality_mapping,
        clean_NaN,
        calculate_coverage,
        append_base_context,
        append_binary_layer_by_base_context,
        calculate_read_modification_stats,
        filter_reads_on_modification_thresholds,
        flag_duplicate_reads,
        calculate_complexity_II,
        calculate_position_Youden,
        binarize_on_Youden,
        binarize_adata,
    )
    from ..plotting import plot_read_qc_histograms

    ################################### 1) Load existing ###################################
    # Required user-supplied config values.
    # smf_modality selects the branch below: 'direct' vs. everything else
    # (conversion / deaminase SMF).
    smf_modality = cfg.smf_modality
    is_direct = smf_modality == 'direct'
    output_directory = Path(cfg.output_directory)
    make_dirs([output_directory])

    # Read the redo flags once, with the same False default the CLI wrapper uses.
    force_redo_preprocessing = getattr(cfg, "force_redo_preprocessing", False)
    force_redo_duplicates = getattr(cfg, "force_redo_flag_duplicate_reads", False)

    ######### Begin Preprocessing #########
    pp_dir = output_directory / "preprocessed"

    def _plot_qc_unless_present(current_adata, qc_dir, obs_keys, skipped_what):
        """Plot QC histograms into `qc_dir`, unless it already exists and no redo is forced."""
        if qc_dir.is_dir() and not force_redo_preprocessing:
            print(f'{qc_dir} already exists. Skipping {skipped_what}.')
            return
        make_dirs([pp_dir, qc_dir])
        plot_read_qc_histograms(current_adata,
                                qc_dir,
                                obs_keys,
                                sample_key=cfg.sample_name_col_for_plotting,
                                rows_per_fig=cfg.rows_per_qc_histogram_grid)

    ## Load sample sheet metadata based on barcode mapping ##
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(adata,
                          cfg.sample_sheet_path,
                          mapping_key_column=cfg.sample_sheet_mapping_column,
                          as_category=True,
                          force_reload=cfg.force_reload_sample_sheet)

    # Read length / quality / mapping QC plots before filtering.
    _plot_qc_unless_present(adata,
                            pp_dir / "01_Read_length_and_quality_QC_metrics",
                            cfg.obs_to_plot_pp_qc,
                            "read level QC plotting")

    # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
    print(adata.shape)
    adata = filter_reads_on_length_quality_mapping(adata,
                                                   filter_on_coordinates=cfg.read_coord_filter,
                                                   read_length=cfg.read_len_filter_thresholds,
                                                   length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
                                                   read_quality=cfg.read_quality_filter_thresholds,
                                                   mapping_quality=cfg.read_mapping_quality_filter_thresholds,
                                                   bypass=None,
                                                   force_redo=None)
    print(adata.shape)

    # Same QC plots after filtering.
    _plot_qc_unless_present(adata,
                            pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering",
                            cfg.obs_to_plot_pp_qc,
                            "read level QC plotting")

    ############## Binarize direct modcall data and store in new layer. Clean NaNs ##########
    if is_direct:
        if cfg.fit_position_methylation_thresholds:
            pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
            make_dirs([pp_Youden_dir])
            # Calculate positional methylation thresholds for mod calls
            calculate_position_Youden(adata,
                                      positive_control_sample=cfg.positive_control_sample_methylation_fitting,
                                      negative_control_sample=cfg.negative_control_sample_methylation_fitting,
                                      J_threshold=cfg.fit_j_threshold,
                                      ref_column=cfg.reference_column,
                                      sample_column=cfg.sample_column,
                                      infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
                                      inference_variable=cfg.inference_variable_sample_methylation_fitting,
                                      save=True,
                                      output_directory=pp_Youden_dir
                                      )
            # Binarize the modcalls based on the determined thresholds
            binarize_on_Youden(adata,
                               ref_column=cfg.reference_column,
                               output_layer_name=cfg.output_binary_layer_name
                               )
        else:
            # NOTE: the attribute name is spelled "methlyation" on the config object; keep as-is.
            binarize_adata(adata,
                           source="X",
                           target_layer=cfg.output_binary_layer_name,
                           threshold=cfg.binarize_on_fixed_methlyation_threshold)

        clean_NaN(adata,
                  layer=cfg.output_binary_layer_name,
                  bypass=cfg.bypass_clean_nan,
                  force_redo=cfg.force_redo_clean_nan
                  )
    else:
        clean_NaN(adata,
                  bypass=cfg.bypass_clean_nan,
                  force_redo=cfg.force_redo_clean_nan
                  )

    ############### Calculate positional coverage by reference set in dataset ###############
    calculate_coverage(adata,
                       ref_column=cfg.reference_column,
                       position_nan_threshold=cfg.position_max_nan_threshold)

    ############### Add base context per reference and per-context binary layers ###############
    append_base_context(adata,
                        ref_column=cfg.reference_column,
                        use_consensus=False,
                        native=is_direct,
                        mod_target_bases=cfg.mod_target_bases,
                        bypass=cfg.bypass_append_base_context,
                        force_redo=cfg.force_redo_append_base_context)

    adata = append_binary_layer_by_base_context(adata,
                                                cfg.reference_column,
                                                smf_modality,
                                                bypass=cfg.bypass_append_binary_layer_by_base_context,
                                                force_redo=cfg.force_redo_append_binary_layer_by_base_context)

    ############### Calculate read methylation/deamination statistics for the base contexts ###############
    calculate_read_modification_stats(adata,
                                      cfg.reference_column,
                                      cfg.sample_column,
                                      cfg.mod_target_bases,
                                      bypass=cfg.bypass_calculate_read_modification_stats,
                                      force_redo=cfg.force_redo_calculate_read_modification_stats)

    # obs columns for the modification QC histograms; the same list is used before and after filtering.
    obs_to_plot = ['Raw_modification_signal']
    if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
        obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
    if 'A' in cfg.mod_target_bases:
        obs_to_plot += ['Fraction_A_site_modified']

    ### Sample level read modification metrics before filtering ###
    _plot_qc_unless_present(adata,
                            pp_dir / "03_read_modification_QC_metrics",
                            obs_to_plot,
                            "read level methylation QC plotting")

    ##### Optionally filter reads on modification metrics
    adata = filter_reads_on_modification_thresholds(adata,
                                                    smf_modality=smf_modality,
                                                    mod_target_bases=cfg.mod_target_bases,
                                                    gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
                                                    cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
                                                    any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
                                                    a_thresholds=cfg.read_mod_filtering_a_thresholds,
                                                    use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
                                                    min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
                                                    bypass=cfg.bypass_filter_reads_on_modification_thresholds,
                                                    force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)

    ### Same metrics after filtering ###
    _plot_qc_unless_present(adata,
                            pp_dir / "04_read_modification_QC_metrics_post_filtering",
                            obs_to_plot,
                            "read level methylation QC plotting")

    ############### Duplicate detection for conversion/deamination SMF ###############
    if not is_direct:
        references = adata.obs[cfg.reference_column].cat.categories

        # One [site-mask column, position column] pair per (reference, site type).
        var_filters_sets = [
            [f"{ref}_{site_type}_site", f"position_in_{ref}"]
            for ref in references
            for site_type in cfg.duplicate_detection_site_types
        ]

        pp_dup_qc_dir = pp_dir / "05_read_duplication_QC_metrics"
        make_dirs([pp_dup_qc_dir])

        # Flag duplicate reads and plot duplicate detection QC
        adata_unique, adata = flag_duplicate_reads(adata,
                                                   var_filters_sets,
                                                   distance_threshold=cfg.duplicate_detection_distance_threshold,
                                                   obs_reference_col=cfg.reference_column,
                                                   sample_col=cfg.sample_name_col_for_plotting,
                                                   output_directory=pp_dup_qc_dir,
                                                   metric_keys=cfg.hamming_vs_metric_keys,
                                                   keep_best_metric=cfg.duplicate_detection_keep_best_metric,
                                                   bypass=cfg.bypass_flag_duplicate_reads,
                                                   force_redo=force_redo_duplicates,
                                                   window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
                                                   min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
                                                   do_pca=cfg.duplicate_detection_do_pca,
                                                   pca_n_components=50,
                                                   pca_center=True,
                                                   do_hierarchical=cfg.duplicate_detection_do_hierarchical,
                                                   hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
                                                   hierarchical_metric="euclidean",
                                                   hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors
                                                   )

        # Use the flagged duplicate read groups and perform complexity analysis
        complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
        make_dirs([complexity_outs])
        calculate_complexity_II(
            adata=adata,
            output_directory=complexity_outs,
            sample_col=cfg.sample_name_col_for_plotting,
            ref_col=cfg.reference_column,
            cluster_col='sequence__merged_cluster_id',
            plot=True,
            save_plot=True,  # set False to display instead
            n_boot=30,
            n_depths=12,
            random_state=42,
            csv_summary=True,
            bypass=cfg.bypass_complexity_analysis,
            force_redo=cfg.force_redo_complexity_analysis
        )
    else:
        # No duplicate detection for direct SMF: both outputs are the same object.
        adata_unique = adata

    ############################ Save preprocessed adata (with and without duplicates) ############################
    # A forced duplicate-detection rerun must also overwrite existing outputs; otherwise the
    # recomputed result is returned to the caller but the files on disk stay stale.
    overwrite_outputs = force_redo_preprocessing or (force_redo_duplicates and not is_direct)

    if overwrite_outputs or not pp_adata_path.exists():
        print('Saving preprocessed adata.')
        write_gz_h5ad(adata, pp_adata_path)

    if overwrite_outputs or not pp_dup_rem_adata_path.exists():
        print('Saving preprocessed adata with duplicates removed.')
        write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)

    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)