smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164):
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
1
4
  from pathlib import Path
2
5
  from typing import Optional, Tuple
3
6
 
4
7
  import anndata as ad
5
8
 
6
- from smftools.logging_utils import get_logger
9
+ from smftools.constants import LOGGING_DIR, PREPROCESS_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
7
11
 
8
12
  logger = get_logger(__name__)
9
13
 
@@ -34,30 +38,23 @@ def preprocess_adata(
34
38
  Path to preprocessed, duplicate-removed AnnData.
35
39
  """
36
40
  from ..readwrite import safe_read_h5ad
37
- from .helpers import get_adata_paths
38
- from .load_adata import load_adata
41
+ from .helpers import get_adata_paths, load_experiment_config
39
42
 
40
43
  # 1) Ensure config is loaded and at least *some* AnnData stage exists
41
- loaded_adata, loaded_path, cfg = load_adata(config_path)
44
+ cfg = load_experiment_config(config_path)
42
45
 
43
46
  # 2) Compute canonical paths
44
47
  paths = get_adata_paths(cfg)
45
48
  raw_path = paths.raw
46
49
  pp_path = paths.pp
47
50
  pp_dedup_path = paths.pp_dedup
48
- spatial_path = paths.spatial
49
- hmm_path = paths.hmm
50
51
 
51
52
  raw_exists = raw_path.exists()
52
53
  pp_exists = pp_path.exists()
53
54
  pp_dedup_exists = pp_dedup_path.exists()
54
- spatial_exists = spatial_path.exists()
55
- hmm_exists = hmm_path.exists()
56
55
 
57
- # Helper: reuse loaded_adata if it matches the path we want, else read from disk
56
+ # Helper: read from disk
58
57
  def _load(path: Path):
59
- if loaded_adata is not None and loaded_path == path:
60
- return loaded_adata
61
58
  adata, _ = safe_read_h5ad(path)
62
59
  return adata
63
60
 
@@ -65,20 +62,8 @@ def preprocess_adata(
65
62
  # Case A: full redo of preprocessing
66
63
  # -----------------------------
67
64
  if getattr(cfg, "force_redo_preprocessing", False):
68
- logger.info(
69
- "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
70
- )
71
-
72
- if hmm_exists:
73
- adata = _load(hmm_path)
74
- source_path = hmm_path
75
- elif spatial_exists:
76
- adata = _load(spatial_path)
77
- source_path = spatial_path
78
- elif pp_dedup_exists:
79
- adata = _load(pp_dedup_path)
80
- source_path = pp_dedup_path
81
- elif pp_exists:
65
+ logger.info("Forcing full redo of preprocessing workflow.")
66
+ if pp_exists:
82
67
  adata = _load(pp_path)
83
68
  source_path = pp_path
84
69
  elif raw_exists:
@@ -133,26 +118,16 @@ def preprocess_adata(
133
118
  # Case C: normal behavior (no explicit redo flags)
134
119
  # -----------------------------
135
120
 
136
- # If HMM exists, preprocessing is considered “done enough”
137
- if hmm_exists:
138
- logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
139
- return (None, None, None, None)
140
-
141
- # If spatial exists, also skip re-preprocessing by default
142
- if spatial_exists:
143
- logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
144
- return (None, None, None, None)
145
-
146
121
  # If pp_dedup exists, just return paths (no recomputation)
147
122
  if pp_dedup_exists:
148
- logger.debug(
123
+ logger.info(
149
124
  f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
150
125
  )
151
126
  return (None, pp_path, None, pp_dedup_path)
152
127
 
153
128
  # If pp exists but pp_dedup does not, load pp and run core
154
129
  if pp_exists:
155
- logger.debug(f"Preprocessed AnnData found: {pp_path}")
130
+ logger.info(f"Preprocessed AnnData found: {pp_path}")
156
131
  adata = _load(pp_path)
157
132
  source_path = pp_path
158
133
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -216,13 +191,19 @@ def preprocess_adata_core(
216
191
  pp_dup_rem_adata_path : Path
217
192
  Path where pp_dedup_adata was written.
218
193
  """
194
+ from datetime import datetime
219
195
  from pathlib import Path
220
196
 
221
197
  from ..metadata import record_smftools_metadata
222
- from ..plotting import plot_read_qc_histograms
198
+ from ..plotting import (
199
+ plot_read_qc_histograms,
200
+ plot_read_span_quality_clustermaps,
201
+ plot_sequence_integer_encoding_clustermaps,
202
+ )
223
203
  from ..preprocessing import (
224
204
  append_base_context,
225
205
  append_binary_layer_by_base_context,
206
+ append_mismatch_frequency_sites,
226
207
  binarize_adata,
227
208
  binarize_on_Youden,
228
209
  calculate_complexity_II,
@@ -233,22 +214,39 @@ def preprocess_adata_core(
233
214
  filter_reads_on_length_quality_mapping,
234
215
  filter_reads_on_modification_thresholds,
235
216
  flag_duplicate_reads,
217
+ invert_adata,
236
218
  load_sample_sheet,
219
+ reindex_references_adata,
237
220
  )
238
221
  from ..readwrite import make_dirs
239
222
  from .helpers import write_gz_h5ad
240
223
 
241
224
  ################################### 1) Load existing ###################################
225
+ date_str = datetime.today().strftime("%y%m%d")
226
+ now = datetime.now()
227
+ time_str = now.strftime("%H%M%S")
228
+
229
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
230
+
242
231
  # General config variable init - Necessary user passed inputs
243
232
  smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
244
233
  output_directory = Path(
245
234
  cfg.output_directory
246
235
  ) # Path to the output directory to make for the analysis. Necessary.
247
- make_dirs([output_directory])
236
+ preprocess_directory = output_directory / PREPROCESS_DIR
237
+ logging_directory = preprocess_directory / LOGGING_DIR
248
238
 
249
- ######### Begin Preprocessing #########
250
- pp_dir = output_directory / "preprocessed"
239
+ make_dirs([output_directory, preprocess_directory])
240
+
241
+ if cfg.emit_log_file:
242
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
243
+ make_dirs([logging_directory])
244
+ else:
245
+ log_file = None
246
+
247
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
251
248
 
249
+ ######### Begin Preprocessing #########
252
250
  ## Load sample sheet metadata based on barcode mapping ##
253
251
  if getattr(cfg, "sample_sheet_path", None):
254
252
  load_sample_sheet(
@@ -262,12 +260,12 @@ def preprocess_adata_core(
262
260
  pass
263
261
 
264
262
  # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
265
- pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
263
+ pp_length_qc_dir = preprocess_directory / "01_Read_length_and_quality_QC_metrics"
266
264
 
267
265
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
268
266
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
269
267
  else:
270
- make_dirs([pp_dir, pp_length_qc_dir])
268
+ make_dirs([preprocess_directory, pp_length_qc_dir])
271
269
  plot_read_qc_histograms(
272
270
  adata,
273
271
  pp_length_qc_dir,
@@ -290,12 +288,12 @@ def preprocess_adata_core(
290
288
  )
291
289
  print(adata.shape)
292
290
 
293
- pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
291
+ pp_length_qc_dir = preprocess_directory / "02_Read_length_and_quality_QC_metrics_post_filtering"
294
292
 
295
293
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
296
294
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
297
295
  else:
298
- make_dirs([pp_dir, pp_length_qc_dir])
296
+ make_dirs([preprocess_directory, pp_length_qc_dir])
299
297
  plot_read_qc_histograms(
300
298
  adata,
301
299
  pp_length_qc_dir,
@@ -308,7 +306,7 @@ def preprocess_adata_core(
308
306
  if smf_modality == "direct":
309
307
  native = True
310
308
  if cfg.fit_position_methylation_thresholds:
311
- pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
309
+ pp_Youden_dir = preprocess_directory / "02B_Position_wide_Youden_threshold_performance"
312
310
  make_dirs([pp_Youden_dir])
313
311
  # Calculate positional methylation thresholds for mod calls
314
312
  calculate_position_Youden(
@@ -357,7 +355,6 @@ def preprocess_adata_core(
357
355
  )
358
356
 
359
357
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
360
- # Additionally, store base_context level binary modification arrays in adata.obsm
361
358
  append_base_context(
362
359
  adata,
363
360
  ref_column=cfg.reference_column,
@@ -376,17 +373,18 @@ def preprocess_adata_core(
376
373
  cfg.mod_target_bases,
377
374
  bypass=cfg.bypass_calculate_read_modification_stats,
378
375
  force_redo=cfg.force_redo_calculate_read_modification_stats,
376
+ smf_modality=cfg.smf_modality,
379
377
  )
380
378
 
381
379
  ### Make a dir for outputting sample level read modification metrics before filtering ###
382
- pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
380
+ pp_meth_qc_dir = preprocess_directory / "03_read_modification_QC_metrics"
383
381
 
384
382
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
385
383
  logger.debug(
386
384
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
387
385
  )
388
386
  else:
389
- make_dirs([pp_dir, pp_meth_qc_dir])
387
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
390
388
  obs_to_plot = ["Raw_modification_signal"]
391
389
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
392
390
  obs_to_plot += [
@@ -420,14 +418,14 @@ def preprocess_adata_core(
420
418
  force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
421
419
  )
422
420
 
423
- pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
421
+ pp_meth_qc_dir = preprocess_directory / "04_read_modification_QC_metrics_post_filtering"
424
422
 
425
423
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
426
424
  logger.debug(
427
425
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
428
426
  )
429
427
  else:
430
- make_dirs([pp_dir, pp_meth_qc_dir])
428
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
431
429
  obs_to_plot = ["Raw_modification_signal"]
432
430
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
433
431
  obs_to_plot += [
@@ -487,7 +485,7 @@ def preprocess_adata_core(
487
485
  for site_type in cfg.duplicate_detection_site_types:
488
486
  var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
489
487
 
490
- pp_dup_qc_dir = pp_dir / "05_read_duplication_QC_metrics"
488
+ pp_dup_qc_dir = preprocess_directory / "05_read_duplication_QC_metrics"
491
489
 
492
490
  make_dirs([pp_dup_qc_dir])
493
491
 
@@ -512,7 +510,7 @@ def preprocess_adata_core(
512
510
  hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
513
511
  hierarchical_metric="euclidean",
514
512
  hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
515
- demux_types=("double", "already"),
513
+ demux_types=cfg.duplicate_detection_demux_types_to_use,
516
514
  demux_col="demux_type",
517
515
  )
518
516
 
@@ -539,6 +537,135 @@ def preprocess_adata_core(
539
537
  adata_unique = adata
540
538
  ########################################################################################################################
541
539
 
540
+ # -----------------------------
541
+ # Optional inversion along positions axis
542
+ # -----------------------------
543
+ if getattr(cfg, "invert_adata", False):
544
+ adata = invert_adata(adata)
545
+
546
+ # -----------------------------
547
+ # Optional reindexing by reference
548
+ # -----------------------------
549
+ reindex_references_adata(
550
+ adata,
551
+ reference_col=cfg.reference_column,
552
+ offsets=cfg.reindexing_offsets,
553
+ new_col=cfg.reindexed_var_suffix,
554
+ )
555
+
556
+ ############################################### Append mismatch frequency per position ###############################################
557
+ append_mismatch_frequency_sites(
558
+ adata_unique,
559
+ ref_column=cfg.reference_column,
560
+ mismatch_layer=cfg.mismatch_frequency_layer,
561
+ read_span_layer=cfg.mismatch_frequency_read_span_layer,
562
+ mismatch_frequency_range=cfg.mismatch_frequency_range,
563
+ bypass=cfg.bypass_append_mismatch_frequency_sites,
564
+ force_redo=cfg.force_redo_append_mismatch_frequency_sites,
565
+ )
566
+
567
+ ############################################### Plot integer sequence encoding clustermaps ###############################################
568
+ if "sequence_integer_encoding" not in adata.layers:
569
+ logger.debug(
570
+ "sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
571
+ )
572
+ else:
573
+ pp_seq_clustermap_dir = preprocess_directory / "06_sequence_integer_encoding_clustermaps"
574
+ if pp_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
575
+ logger.debug(
576
+ f"{pp_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
577
+ )
578
+ else:
579
+ make_dirs([pp_seq_clustermap_dir])
580
+ plot_sequence_integer_encoding_clustermaps(
581
+ adata,
582
+ sample_col=cfg.sample_name_col_for_plotting,
583
+ reference_col=cfg.reference_column,
584
+ demux_types=cfg.clustermap_demux_types_to_plot,
585
+ min_quality=None,
586
+ min_length=None,
587
+ min_mapped_length_to_reference_length_ratio=None,
588
+ sort_by="none",
589
+ max_unknown_fraction=0.5,
590
+ save_path=pp_seq_clustermap_dir,
591
+ show_position_axis=True,
592
+ )
593
+
594
+ pp_dedup_seq_clustermap_dir = (
595
+ preprocess_directory / "deduplicated" / "06_sequence_integer_encoding_clustermaps"
596
+ )
597
+ if pp_dedup_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
598
+ logger.debug(
599
+ f"{pp_dedup_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
600
+ )
601
+ else:
602
+ make_dirs([pp_dedup_seq_clustermap_dir])
603
+ plot_sequence_integer_encoding_clustermaps(
604
+ adata_unique,
605
+ sample_col=cfg.sample_name_col_for_plotting,
606
+ reference_col=cfg.reference_column,
607
+ demux_types=cfg.clustermap_demux_types_to_plot,
608
+ min_quality=None,
609
+ min_length=None,
610
+ min_mapped_length_to_reference_length_ratio=None,
611
+ sort_by="none",
612
+ max_unknown_fraction=0.5,
613
+ save_path=pp_dedup_seq_clustermap_dir,
614
+ show_position_axis=True,
615
+ )
616
+
617
+ ############################################### Plot read span mask + base quality clustermaps ###############################################
618
+ quality_layer = None
619
+ if "base_quality_scores" in adata.layers:
620
+ quality_layer = "base_quality_scores"
621
+ elif "base_qualities" in adata.layers:
622
+ quality_layer = "base_qualities"
623
+
624
+ if "read_span_mask" not in adata.layers or quality_layer is None:
625
+ logger.debug(
626
+ "read_span_mask and base quality layers not found; skipping read span/base quality clustermaps."
627
+ )
628
+ else:
629
+ pp_span_quality_dir = preprocess_directory / "07_read_span_quality_clustermaps"
630
+ if pp_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
631
+ logger.debug(
632
+ f"{pp_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
633
+ )
634
+ else:
635
+ make_dirs([pp_span_quality_dir])
636
+ plot_read_span_quality_clustermaps(
637
+ adata,
638
+ sample_col=cfg.sample_name_col_for_plotting,
639
+ reference_col=cfg.reference_column,
640
+ quality_layer=quality_layer,
641
+ read_span_layer="read_span_mask",
642
+ demux_types=cfg.clustermap_demux_types_to_plot,
643
+ save_path=pp_span_quality_dir,
644
+ show_position_axis=True,
645
+ max_nan_fraction=0.5,
646
+ )
647
+
648
+ pp_dedup_span_quality_dir = (
649
+ preprocess_directory / "deduplicated" / "07_read_span_quality_clustermaps"
650
+ )
651
+ if pp_dedup_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
652
+ logger.debug(
653
+ f"{pp_dedup_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
654
+ )
655
+ elif quality_layer in adata_unique.layers and "read_span_mask" in adata_unique.layers:
656
+ make_dirs([pp_dedup_span_quality_dir])
657
+ plot_read_span_quality_clustermaps(
658
+ adata_unique,
659
+ sample_col=cfg.sample_name_col_for_plotting,
660
+ reference_col=cfg.reference_column,
661
+ quality_layer=quality_layer,
662
+ read_span_layer="read_span_mask",
663
+ demux_types=cfg.clustermap_demux_types_to_plot,
664
+ save_path=pp_dedup_span_quality_dir,
665
+ show_position_axis=True,
666
+ max_nan_fraction=0.5,
667
+ )
668
+
542
669
  ############################################### Save preprocessed adata with duplicate detection ###############################################
543
670
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
544
671
  logger.info("Saving preprocessed adata.")