smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +48 -0
  3. smftools/cli/hmm_adata.py +168 -145
  4. smftools/cli/load_adata.py +155 -95
  5. smftools/cli/preprocess_adata.py +222 -130
  6. smftools/cli/spatial_adata.py +441 -308
  7. smftools/cli_entry.py +4 -5
  8. smftools/config/conversion.yaml +12 -5
  9. smftools/config/deaminase.yaml +11 -9
  10. smftools/config/default.yaml +123 -19
  11. smftools/config/direct.yaml +3 -0
  12. smftools/config/experiment_config.py +120 -19
  13. smftools/hmm/HMM.py +12 -1
  14. smftools/hmm/__init__.py +0 -6
  15. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  16. smftools/hmm/call_hmm_peaks.py +318 -90
  17. smftools/informatics/bam_functions.py +28 -29
  18. smftools/informatics/h5ad_functions.py +1 -1
  19. smftools/plotting/general_plotting.py +97 -51
  20. smftools/plotting/position_stats.py +3 -3
  21. smftools/preprocessing/__init__.py +2 -4
  22. smftools/preprocessing/append_base_context.py +34 -25
  23. smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
  24. smftools/preprocessing/binarize_on_Youden.py +10 -8
  25. smftools/preprocessing/calculate_complexity_II.py +1 -1
  26. smftools/preprocessing/calculate_coverage.py +16 -13
  27. smftools/preprocessing/calculate_position_Youden.py +41 -25
  28. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  29. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  30. smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
  31. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  32. smftools/preprocessing/invert_adata.py +1 -1
  33. smftools/preprocessing/load_sample_sheet.py +1 -1
  34. smftools/preprocessing/reindex_references_adata.py +37 -0
  35. smftools/readwrite.py +94 -0
  36. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
  37. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
  38. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  39. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  40. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  41. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  42. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  43. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  44. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  45. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
  46. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,109 +1,227 @@
1
- def preprocess_adata(config_path):
1
+ from pathlib import Path
2
+ from typing import Optional, Tuple
3
+
4
+ import anndata as ad
5
+
6
+ def preprocess_adata(
7
+ config_path: str,
8
+ ) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
2
9
  """
3
- High-level function to call for preprocessing an adata object.
4
- Command line accesses this through smftools preprocess <config_path>
10
+ CLI-facing wrapper for preprocessing.
5
11
 
6
- Parameters:
7
- config_path (str): A string representing the file path to the experiment configuration csv file.
12
+ Called by: `smftools preprocess <config_path>`
8
13
 
9
- Returns:
10
- (pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path)
14
+ - Ensure a raw AnnData exists (or some later-stage AnnData) via `load_adata`.
15
+ - Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
16
+ - Respect cfg flags (force_redo_preprocessing, force_redo_flag_duplicate_reads).
17
+ - Decide what starting AnnData to load (or whether to early-return).
18
+ - Call `preprocess_adata_core(...)` when appropriate.
19
+
20
+ Returns
21
+ -------
22
+ pp_adata : AnnData | None
23
+ Preprocessed AnnData (may be None if we skipped work).
24
+ pp_adata_path : Path | None
25
+ Path to preprocessed AnnData.
26
+ pp_dedup_adata : AnnData | None
27
+ Preprocessed, duplicate-removed AnnData.
28
+ pp_dedup_adata_path : Path | None
29
+ Path to preprocessed, duplicate-removed AnnData.
11
30
  """
12
- from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
31
+ from ..readwrite import safe_read_h5ad
13
32
  from .load_adata import load_adata
33
+ from .helpers import get_adata_paths
14
34
 
15
- import numpy as np
16
- import pandas as pd
17
- import anndata as ad
18
- import scanpy as sc
35
+ # 1) Ensure config is loaded and at least *some* AnnData stage exists
36
+ loaded_adata, loaded_path, cfg = load_adata(config_path)
37
+
38
+ # 2) Compute canonical paths
39
+ paths = get_adata_paths(cfg)
40
+ raw_path = paths.raw
41
+ pp_path = paths.pp
42
+ pp_dedup_path = paths.pp_dedup
43
+ spatial_path = paths.spatial
44
+ hmm_path = paths.hmm
45
+
46
+ raw_exists = raw_path.exists()
47
+ pp_exists = pp_path.exists()
48
+ pp_dedup_exists = pp_dedup_path.exists()
49
+ spatial_exists = spatial_path.exists()
50
+ hmm_exists = hmm_path.exists()
51
+
52
+ # Helper: reuse loaded_adata if it matches the path we want, else read from disk
53
+ def _load(path: Path):
54
+ if loaded_adata is not None and loaded_path == path:
55
+ return loaded_adata
56
+ adata, _ = safe_read_h5ad(path)
57
+ return adata
58
+
59
+ # -----------------------------
60
+ # Case A: full redo of preprocessing
61
+ # -----------------------------
62
+ if getattr(cfg, "force_redo_preprocessing", False):
63
+ print("Forcing full redo of preprocessing workflow, starting from latest stage AnnData available.")
64
+
65
+ if hmm_exists:
66
+ adata = _load(hmm_path)
67
+ elif spatial_exists:
68
+ adata = _load(spatial_path)
69
+ elif pp_dedup_exists:
70
+ adata = _load(pp_dedup_path)
71
+ elif pp_exists:
72
+ adata = _load(pp_path)
73
+ elif raw_exists:
74
+ adata = _load(raw_path)
75
+ else:
76
+ print("Cannot redo preprocessing: no AnnData available at any stage.")
77
+ return (None, None, None, None)
78
+
79
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
80
+ adata=adata,
81
+ cfg=cfg,
82
+ pp_adata_path=pp_path,
83
+ pp_dup_rem_adata_path=pp_dedup_path,
84
+ )
85
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
86
+
87
+ # -----------------------------
88
+ # Case B: redo duplicate detection only
89
+ # -----------------------------
90
+ if getattr(cfg, "force_redo_flag_duplicate_reads", False):
91
+ print(
92
+ "Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
93
+ "if available. Otherwise, will use the raw AnnData."
94
+ )
95
+ if pp_exists:
96
+ adata = _load(pp_path)
97
+ elif raw_exists:
98
+ adata = _load(raw_path)
99
+ else:
100
+ print(
101
+ "Cannot redo duplicate detection: no compatible AnnData available "
102
+ "(need at least raw or preprocessed)."
103
+ )
104
+ return (None, None, None, None)
105
+
106
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
107
+ adata=adata,
108
+ cfg=cfg,
109
+ pp_adata_path=pp_path,
110
+ pp_dup_rem_adata_path=pp_dedup_path,
111
+ )
112
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
113
+
114
+ # -----------------------------
115
+ # Case C: normal behavior (no explicit redo flags)
116
+ # -----------------------------
117
+
118
+ # If HMM exists, preprocessing is considered “done enough”
119
+ if hmm_exists:
120
+ print(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
121
+ return (None, None, None, None)
122
+
123
+ # If spatial exists, also skip re-preprocessing by default
124
+ if spatial_exists:
125
+ print(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
126
+ return (None, None, None, None)
127
+
128
+ # If pp_dedup exists, just return paths (no recomputation)
129
+ if pp_dedup_exists:
130
+ print(f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}")
131
+ return (None, pp_path, None, pp_dedup_path)
132
+
133
+ # If pp exists but pp_dedup does not, load pp and run core
134
+ if pp_exists:
135
+ print(f"Preprocessed AnnData found: {pp_path}")
136
+ adata = _load(pp_path)
137
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
138
+ adata=adata,
139
+ cfg=cfg,
140
+ pp_adata_path=pp_path,
141
+ pp_dup_rem_adata_path=pp_dedup_path,
142
+ )
143
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
144
+
145
+ # Otherwise, fall back to raw (if available)
146
+ if raw_exists:
147
+ adata = _load(raw_path)
148
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
149
+ adata=adata,
150
+ cfg=cfg,
151
+ pp_adata_path=pp_path,
152
+ pp_dup_rem_adata_path=pp_dedup_path,
153
+ )
154
+ return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
155
+
156
+ print("No AnnData available at any stage for preprocessing.")
157
+ return (None, None, None, None)
158
+
159
+
160
+ def preprocess_adata_core(
161
+ adata: ad.AnnData,
162
+ cfg,
163
+ pp_adata_path: Path,
164
+ pp_dup_rem_adata_path: Path,
165
+ ) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
166
+ """
167
+ Core preprocessing pipeline.
168
+
169
+ Assumes:
170
+ - `adata` is an AnnData object at some stage (raw/pp/etc.) to start preprocessing from.
171
+ - `cfg` is the ExperimentConfig containing all thresholds & options.
172
+ - `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for
173
+ preprocessed and preprocessed+deduplicated AnnData.
19
174
 
20
- import os
21
- from importlib import resources
175
+ Does NOT:
176
+ - Decide which stage to load from (that's the wrapper's job).
177
+ - Decide whether to skip entirely; it always runs its steps, but individual
178
+ sub-steps may skip based on `cfg.bypass_*` or directory existence.
179
+
180
+ Returns
181
+ -------
182
+ pp_adata : AnnData
183
+ Preprocessed AnnData (with QC filters, binarization, etc.).
184
+ pp_adata_path : Path
185
+ Path where pp_adata was written.
186
+ pp_dedup_adata : AnnData
187
+ Preprocessed AnnData with duplicate reads removed (for non-direct SMF).
188
+ pp_dup_rem_adata_path : Path
189
+ Path where pp_dedup_adata was written.
190
+ """
22
191
  from pathlib import Path
23
192
 
24
- from datetime import datetime
25
- date_str = datetime.today().strftime("%y%m%d")
193
+ import numpy as np
26
194
 
27
- ################################### 1) Load existing ###################################
28
- adata, adata_path, cfg = load_adata(config_path)
195
+ from .helpers import write_gz_h5ad
196
+ from ..readwrite import make_dirs
197
+ from ..preprocessing import (
198
+ load_sample_sheet,
199
+ filter_reads_on_length_quality_mapping,
200
+ clean_NaN,
201
+ calculate_coverage,
202
+ append_base_context,
203
+ append_binary_layer_by_base_context,
204
+ calculate_read_modification_stats,
205
+ filter_reads_on_modification_thresholds,
206
+ flag_duplicate_reads,
207
+ calculate_complexity_II,
208
+ calculate_position_Youden,
209
+ binarize_on_Youden,
210
+ binarize_adata,
211
+ )
212
+ from ..plotting import plot_read_qc_histograms
29
213
 
214
+ ################################### 1) Load existing ###################################
30
215
  # General config variable init - Necessary user passed inputs
31
216
  smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
32
217
  output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
33
-
34
- # Make initial output directory
35
218
  make_dirs([output_directory])
36
219
 
37
- input_manager_df = pd.read_csv(cfg.summary_file)
38
- initial_adata_path = Path(input_manager_df['load_adata'][0])
39
- pp_adata_path = Path(input_manager_df['pp_adata'][0])
40
- pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
41
- spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
42
- hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
43
-
44
- if adata:
45
- # This happens on first run of the load pipeline
46
- pass
47
- else:
48
- # If an anndata is saved, check which stages of the anndata are available
49
- initial_version_available = initial_adata_path.exists()
50
- preprocessed_version_available = pp_adata_path.exists()
51
- preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
52
- spatial_adata_exists = spatial_adata_path.exists()
53
- hmm_adata_exists = hmm_adata_path.exists()
54
-
55
- if cfg.force_redo_preprocessing:
56
- print(f"Forcing full redo of preprocessing workflow, starting from earliest stage adata available.")
57
- if initial_version_available:
58
- adata, load_report = safe_read_h5ad(initial_adata_path)
59
- elif preprocessed_version_available:
60
- adata, load_report = safe_read_h5ad(pp_adata_path)
61
- elif preprocessed_dup_removed_version_available:
62
- adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
63
- else:
64
- print(f"Can not redo preprocessing when there is no adata available.")
65
- return
66
- elif cfg.force_redo_flag_duplicate_reads:
67
- print(f"Forcing redo of duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
68
- if preprocessed_version_available:
69
- adata, load_report = safe_read_h5ad(pp_adata_path)
70
- elif initial_version_available:
71
- adata, load_report = safe_read_h5ad(initial_adata_path)
72
- else:
73
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
74
- return
75
- elif cfg.force_redo_basic_analyses:
76
- print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
77
- if preprocessed_version_available:
78
- adata, load_report = safe_read_h5ad(pp_adata_path)
79
- elif initial_version_available:
80
- adata, load_report = safe_read_h5ad(initial_adata_path)
81
- else:
82
- print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
83
- elif hmm_adata_exists:
84
- print(f"HMM anndata found: {hmm_adata_path}")
85
- return (None, None, None, None)
86
- elif spatial_adata_exists:
87
- print(f"Spatial anndata found: {spatial_adata_exists}")
88
- return (None, None, None, None)
89
- elif preprocessed_dup_removed_version_available:
90
- print(f"Preprocessed deduplicated anndata found: {pp_dup_rem_adata_path}")
91
- return (None, pp_adata_path, None, pp_dup_rem_adata_path)
92
- elif preprocessed_version_available:
93
- print(f"Preprocessed anndata found: {pp_adata_path}")
94
- adata, load_report = safe_read_h5ad(pp_adata_path)
95
- elif initial_version_available:
96
- adata, load_report = safe_read_h5ad(initial_adata_path)
97
- else:
98
- print(f"No adata available.")
99
- return
100
-
101
220
  ######### Begin Preprocessing #########
102
221
  pp_dir = output_directory / "preprocessed"
103
222
 
104
223
  ## Load sample sheet metadata based on barcode mapping ##
105
- if cfg.sample_sheet_path:
106
- from ..preprocessing import load_sample_sheet
224
+ if getattr(cfg, "sample_sheet_path", None):
107
225
  load_sample_sheet(adata,
108
226
  cfg.sample_sheet_path,
109
227
  mapping_key_column=cfg.sample_sheet_mapping_column,
@@ -118,17 +236,14 @@ def preprocess_adata(config_path):
118
236
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
119
237
  print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
120
238
  else:
121
- from ..plotting import plot_read_qc_histograms
122
239
  make_dirs([pp_dir, pp_length_qc_dir])
123
- obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
124
240
  plot_read_qc_histograms(adata,
125
241
  pp_length_qc_dir,
126
- obs_to_plot,
242
+ cfg.obs_to_plot_pp_qc,
127
243
  sample_key=cfg.sample_name_col_for_plotting,
128
244
  rows_per_fig=cfg.rows_per_qc_histogram_grid)
129
245
 
130
246
  # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
131
- from ..preprocessing import filter_reads_on_length_quality_mapping
132
247
  print(adata.shape)
133
248
  adata = filter_reads_on_length_quality_mapping(adata,
134
249
  filter_on_coordinates=cfg.read_coord_filter,
@@ -145,19 +260,15 @@ def preprocess_adata(config_path):
145
260
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
146
261
  print( f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
147
262
  else:
148
- from ..plotting import plot_read_qc_histograms
149
263
  make_dirs([pp_dir, pp_length_qc_dir])
150
- obs_to_plot = ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
151
264
  plot_read_qc_histograms(adata,
152
265
  pp_length_qc_dir,
153
- obs_to_plot,
266
+ cfg.obs_to_plot_pp_qc,
154
267
  sample_key=cfg.sample_name_col_for_plotting,
155
268
  rows_per_fig=cfg.rows_per_qc_histogram_grid)
156
269
 
157
270
  ############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
158
- from ..preprocessing import clean_NaN
159
271
  if smf_modality == 'direct':
160
- from ..preprocessing import calculate_position_Youden, binarize_on_Youden, binarize_adata
161
272
  native = True
162
273
  if cfg.fit_position_methylation_thresholds:
163
274
  pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
@@ -167,7 +278,8 @@ def preprocess_adata(config_path):
167
278
  positive_control_sample=cfg.positive_control_sample_methylation_fitting,
168
279
  negative_control_sample=cfg.negative_control_sample_methylation_fitting,
169
280
  J_threshold=cfg.fit_j_threshold,
170
- obs_column=cfg.reference_column,
281
+ ref_column=cfg.reference_column,
282
+ sample_column=cfg.sample_column,
171
283
  infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
172
284
  inference_variable=cfg.inference_variable_sample_methylation_fitting,
173
285
  save=True,
@@ -175,7 +287,7 @@ def preprocess_adata(config_path):
175
287
  )
176
288
  # binarize the modcalls based on the determined thresholds
177
289
  binarize_on_Youden(adata,
178
- obs_column=cfg.reference_column,
290
+ ref_column=cfg.reference_column,
179
291
  output_layer_name=cfg.output_binary_layer_name
180
292
  )
181
293
  else:
@@ -195,12 +307,16 @@ def preprocess_adata(config_path):
195
307
  bypass=cfg.bypass_clean_nan,
196
308
  force_redo=cfg.force_redo_clean_nan
197
309
  )
310
+
311
+ ############### Calculate positional coverage by reference set in dataset ###############
312
+ calculate_coverage(adata,
313
+ ref_column=cfg.reference_column,
314
+ position_nan_threshold=cfg.position_max_nan_threshold)
198
315
 
199
316
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
200
- from ..preprocessing import append_base_context, append_binary_layer_by_base_context
201
317
  # Additionally, store base_context level binary modification arrays in adata.obsm
202
318
  append_base_context(adata,
203
- obs_column=cfg.reference_column,
319
+ ref_column=cfg.reference_column,
204
320
  use_consensus=False,
205
321
  native=native,
206
322
  mod_target_bases=cfg.mod_target_bases,
@@ -212,20 +328,14 @@ def preprocess_adata(config_path):
212
328
  smf_modality,
213
329
  bypass=cfg.bypass_append_binary_layer_by_base_context,
214
330
  force_redo=cfg.force_redo_append_binary_layer_by_base_context)
215
-
216
- ############### Optional inversion of the adata along positions axis ###################
217
- if cfg.invert_adata:
218
- from ..preprocessing import invert_adata
219
- adata = invert_adata(adata)
220
331
 
221
332
  ############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
222
- from ..preprocessing import calculate_read_modification_stats
223
333
  calculate_read_modification_stats(adata,
224
334
  cfg.reference_column,
225
335
  cfg.sample_column,
226
336
  cfg.mod_target_bases,
227
337
  bypass=cfg.bypass_calculate_read_modification_stats,
228
- force_redo=cfg.force_redo_calculate_read_modification_stats)
338
+ force_redo=cfg.force_redo_calculate_read_modification_stats)
229
339
 
230
340
  ### Make a dir for outputting sample level read modification metrics before filtering ###
231
341
  pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
@@ -233,11 +343,10 @@ def preprocess_adata(config_path):
233
343
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
234
344
  print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
235
345
  else:
236
- from ..plotting import plot_read_qc_histograms
237
346
  make_dirs([pp_dir, pp_meth_qc_dir])
238
347
  obs_to_plot = ['Raw_modification_signal']
239
348
  if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
240
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
349
+ obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
241
350
  if 'A' in cfg.mod_target_bases:
242
351
  obs_to_plot += ['Fraction_A_site_modified']
243
352
  plot_read_qc_histograms(adata,
@@ -246,13 +355,12 @@ def preprocess_adata(config_path):
246
355
  rows_per_fig=cfg.rows_per_qc_histogram_grid)
247
356
 
248
357
  ##### Optionally filter reads on modification metrics
249
- from ..preprocessing import filter_reads_on_modification_thresholds
250
358
  adata = filter_reads_on_modification_thresholds(adata,
251
359
  smf_modality=smf_modality,
252
360
  mod_target_bases=cfg.mod_target_bases,
253
361
  gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
254
362
  cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
255
- any_c_thresholds=cfg.read_mod_filtering_any_c_thresholds,
363
+ any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
256
364
  a_thresholds=cfg.read_mod_filtering_a_thresholds,
257
365
  use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
258
366
  min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
@@ -264,27 +372,19 @@ def preprocess_adata(config_path):
264
372
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
265
373
  print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
266
374
  else:
267
- from ..plotting import plot_read_qc_histograms
268
375
  make_dirs([pp_dir, pp_meth_qc_dir])
269
376
  obs_to_plot = ['Raw_modification_signal']
270
377
  if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
271
- obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
378
+ obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_C_site_modified']
272
379
  if 'A' in cfg.mod_target_bases:
273
380
  obs_to_plot += ['Fraction_A_site_modified']
274
381
  plot_read_qc_histograms(adata,
275
382
  pp_meth_qc_dir, obs_to_plot,
276
383
  sample_key=cfg.sample_name_col_for_plotting,
277
384
  rows_per_fig=cfg.rows_per_qc_histogram_grid)
278
-
279
- ############### Calculate positional coverage in dataset ###############
280
- from ..preprocessing import calculate_coverage
281
- calculate_coverage(adata,
282
- obs_column=cfg.reference_column,
283
- position_nan_threshold=cfg.position_max_nan_threshold)
284
385
 
285
386
  ############### Duplicate detection for conversion/deamination SMF ###############
286
387
  if smf_modality != 'direct':
287
- from ..preprocessing import flag_duplicate_reads, calculate_complexity_II
288
388
  references = adata.obs[cfg.reference_column].cat.categories
289
389
 
290
390
  var_filters_sets =[]
@@ -342,22 +442,14 @@ def preprocess_adata(config_path):
342
442
  ########################################################################################################################
343
443
 
344
444
  ############################################### Save preprocessed adata with duplicate detection ###############################################
345
- from ..readwrite import safe_write_h5ad
346
445
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
347
446
  print('Saving preprocessed adata.')
348
- if ".gz" == pp_adata_path.suffix:
349
- safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
350
- else:
351
- pp_adata_path = pp_adata_path.with_name(pp_adata_path.name + '.gz')
352
- safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
447
+ write_gz_h5ad(adata, pp_adata_path)
353
448
 
354
449
  if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
355
450
  print('Saving preprocessed adata with duplicates removed.')
356
- if ".gz" == pp_dup_rem_adata_path.suffix:
357
- safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
358
- else:
359
- pp_adata_path = pp_dup_rem_adata_path.with_name(pp_dup_rem_adata_path.name + '.gz')
360
- safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
451
+ write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
452
+
361
453
  ########################################################################################################################
362
454
 
363
455
  return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)