smftools-0.3.0-py3-none-any.whl → smftools-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,19 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from pathlib import Path
4
5
  from typing import Optional, Tuple
5
6
 
6
7
  import anndata as ad
7
8
 
8
- from smftools.logging_utils import get_logger
9
+ from smftools.constants import (
10
+ BASE_QUALITY_SCORES,
11
+ DEMUX_TYPE,
12
+ LOGGING_DIR,
13
+ PREPROCESS_DIR,
14
+ READ_SPAN_MASK,
15
+ )
16
+ from smftools.logging_utils import get_logger, setup_logging
9
17
 
10
18
  logger = get_logger(__name__)
11
19
 
@@ -36,30 +44,23 @@ def preprocess_adata(
36
44
  Path to preprocessed, duplicate-removed AnnData.
37
45
  """
38
46
  from ..readwrite import safe_read_h5ad
39
- from .helpers import get_adata_paths
40
- from .load_adata import load_adata
47
+ from .helpers import get_adata_paths, load_experiment_config
41
48
 
42
49
  # 1) Ensure config is loaded and at least *some* AnnData stage exists
43
- loaded_adata, loaded_path, cfg = load_adata(config_path)
50
+ cfg = load_experiment_config(config_path)
44
51
 
45
52
  # 2) Compute canonical paths
46
53
  paths = get_adata_paths(cfg)
47
54
  raw_path = paths.raw
48
55
  pp_path = paths.pp
49
56
  pp_dedup_path = paths.pp_dedup
50
- spatial_path = paths.spatial
51
- hmm_path = paths.hmm
52
57
 
53
58
  raw_exists = raw_path.exists()
54
59
  pp_exists = pp_path.exists()
55
60
  pp_dedup_exists = pp_dedup_path.exists()
56
- spatial_exists = spatial_path.exists()
57
- hmm_exists = hmm_path.exists()
58
61
 
59
- # Helper: reuse loaded_adata if it matches the path we want, else read from disk
62
+ # Helper: read from disk
60
63
  def _load(path: Path):
61
- if loaded_adata is not None and loaded_path == path:
62
- return loaded_adata
63
64
  adata, _ = safe_read_h5ad(path)
64
65
  return adata
65
66
 
@@ -67,20 +68,8 @@ def preprocess_adata(
67
68
  # Case A: full redo of preprocessing
68
69
  # -----------------------------
69
70
  if getattr(cfg, "force_redo_preprocessing", False):
70
- logger.info(
71
- "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
72
- )
73
-
74
- if hmm_exists:
75
- adata = _load(hmm_path)
76
- source_path = hmm_path
77
- elif spatial_exists:
78
- adata = _load(spatial_path)
79
- source_path = spatial_path
80
- elif pp_dedup_exists:
81
- adata = _load(pp_dedup_path)
82
- source_path = pp_dedup_path
83
- elif pp_exists:
71
+ logger.info("Forcing full redo of preprocessing workflow.")
72
+ if pp_exists:
84
73
  adata = _load(pp_path)
85
74
  source_path = pp_path
86
75
  elif raw_exists:
@@ -135,26 +124,16 @@ def preprocess_adata(
135
124
  # Case C: normal behavior (no explicit redo flags)
136
125
  # -----------------------------
137
126
 
138
- # If HMM exists, preprocessing is considered “done enough”
139
- if hmm_exists:
140
- logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
141
- return (None, None, None, None)
142
-
143
- # If spatial exists, also skip re-preprocessing by default
144
- if spatial_exists:
145
- logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
146
- return (None, None, None, None)
147
-
148
127
  # If pp_dedup exists, just return paths (no recomputation)
149
128
  if pp_dedup_exists:
150
- logger.debug(
129
+ logger.info(
151
130
  f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
152
131
  )
153
132
  return (None, pp_path, None, pp_dedup_path)
154
133
 
155
134
  # If pp exists but pp_dedup does not, load pp and run core
156
135
  if pp_exists:
157
- logger.debug(f"Preprocessed AnnData found: {pp_path}")
136
+ logger.info(f"Preprocessed AnnData found: {pp_path}")
158
137
  adata = _load(pp_path)
159
138
  source_path = pp_path
160
139
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -202,10 +181,6 @@ def preprocess_adata_core(
202
181
  - `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for
203
182
  preprocessed and preprocessed+deduplicated AnnData.
204
183
 
205
- Does NOT:
206
- - Decide which stage to load from (that's the wrapper's job).
207
- - Decide whether to skip entirely; it always runs its steps, but individual
208
- sub-steps may skip based on `cfg.bypass_*` or directory existence.
209
184
 
210
185
  Returns
211
186
  -------
@@ -218,10 +193,14 @@ def preprocess_adata_core(
218
193
  pp_dup_rem_adata_path : Path
219
194
  Path where pp_dedup_adata was written.
220
195
  """
196
+ from datetime import datetime
221
197
  from pathlib import Path
222
198
 
223
199
  from ..metadata import record_smftools_metadata
224
- from ..plotting import plot_read_qc_histograms
200
+ from ..plotting import (
201
+ plot_read_qc_histograms,
202
+ plot_read_span_quality_clustermaps,
203
+ )
225
204
  from ..preprocessing import (
226
205
  append_base_context,
227
206
  append_binary_layer_by_base_context,
@@ -235,22 +214,39 @@ def preprocess_adata_core(
235
214
  filter_reads_on_length_quality_mapping,
236
215
  filter_reads_on_modification_thresholds,
237
216
  flag_duplicate_reads,
217
+ invert_adata,
238
218
  load_sample_sheet,
219
+ reindex_references_adata,
239
220
  )
240
221
  from ..readwrite import make_dirs
241
222
  from .helpers import write_gz_h5ad
242
223
 
243
224
  ################################### 1) Load existing ###################################
225
+ date_str = datetime.today().strftime("%y%m%d")
226
+ now = datetime.now()
227
+ time_str = now.strftime("%H%M%S")
228
+
229
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
230
+
244
231
  # General config variable init - Necessary user passed inputs
245
232
  smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
246
233
  output_directory = Path(
247
234
  cfg.output_directory
248
235
  ) # Path to the output directory to make for the analysis. Necessary.
249
- make_dirs([output_directory])
236
+ preprocess_directory = output_directory / PREPROCESS_DIR
237
+ logging_directory = preprocess_directory / LOGGING_DIR
250
238
 
251
- ######### Begin Preprocessing #########
252
- pp_dir = output_directory / "preprocessed"
239
+ make_dirs([output_directory, preprocess_directory])
240
+
241
+ if cfg.emit_log_file:
242
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
243
+ make_dirs([logging_directory])
244
+ else:
245
+ log_file = None
246
+
247
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
253
248
 
249
+ ######### Begin Preprocessing #########
254
250
  ## Load sample sheet metadata based on barcode mapping ##
255
251
  if getattr(cfg, "sample_sheet_path", None):
256
252
  load_sample_sheet(
@@ -264,12 +260,12 @@ def preprocess_adata_core(
264
260
  pass
265
261
 
266
262
  # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
267
- pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
263
+ pp_length_qc_dir = preprocess_directory / "01_Read_length_and_quality_QC_metrics"
268
264
 
269
265
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
270
266
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
271
267
  else:
272
- make_dirs([pp_dir, pp_length_qc_dir])
268
+ make_dirs([preprocess_directory, pp_length_qc_dir])
273
269
  plot_read_qc_histograms(
274
270
  adata,
275
271
  pp_length_qc_dir,
@@ -292,12 +288,12 @@ def preprocess_adata_core(
292
288
  )
293
289
  print(adata.shape)
294
290
 
295
- pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
291
+ pp_length_qc_dir = preprocess_directory / "02_Read_length_and_quality_QC_metrics_post_filtering"
296
292
 
297
293
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
298
294
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
299
295
  else:
300
- make_dirs([pp_dir, pp_length_qc_dir])
296
+ make_dirs([preprocess_directory, pp_length_qc_dir])
301
297
  plot_read_qc_histograms(
302
298
  adata,
303
299
  pp_length_qc_dir,
@@ -310,7 +306,7 @@ def preprocess_adata_core(
310
306
  if smf_modality == "direct":
311
307
  native = True
312
308
  if cfg.fit_position_methylation_thresholds:
313
- pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
309
+ pp_Youden_dir = preprocess_directory / "02B_Position_wide_Youden_threshold_performance"
314
310
  make_dirs([pp_Youden_dir])
315
311
  # Calculate positional methylation thresholds for mod calls
316
312
  calculate_position_Youden(
@@ -359,7 +355,6 @@ def preprocess_adata_core(
359
355
  )
360
356
 
361
357
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
362
- # Additionally, store base_context level binary modification arrays in adata.obsm
363
358
  append_base_context(
364
359
  adata,
365
360
  ref_column=cfg.reference_column,
@@ -378,17 +373,18 @@ def preprocess_adata_core(
378
373
  cfg.mod_target_bases,
379
374
  bypass=cfg.bypass_calculate_read_modification_stats,
380
375
  force_redo=cfg.force_redo_calculate_read_modification_stats,
376
+ smf_modality=cfg.smf_modality,
381
377
  )
382
378
 
383
379
  ### Make a dir for outputting sample level read modification metrics before filtering ###
384
- pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
380
+ pp_meth_qc_dir = preprocess_directory / "03_read_modification_QC_metrics"
385
381
 
386
382
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
387
383
  logger.debug(
388
384
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
389
385
  )
390
386
  else:
391
- make_dirs([pp_dir, pp_meth_qc_dir])
387
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
392
388
  obs_to_plot = ["Raw_modification_signal"]
393
389
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
394
390
  obs_to_plot += [
@@ -422,14 +418,14 @@ def preprocess_adata_core(
422
418
  force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
423
419
  )
424
420
 
425
- pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
421
+ pp_meth_qc_dir = preprocess_directory / "04_read_modification_QC_metrics_post_filtering"
426
422
 
427
423
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
428
424
  logger.debug(
429
425
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
430
426
  )
431
427
  else:
432
- make_dirs([pp_dir, pp_meth_qc_dir])
428
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
433
429
  obs_to_plot = ["Raw_modification_signal"]
434
430
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
435
431
  obs_to_plot += [
@@ -480,6 +476,22 @@ def preprocess_adata_core(
480
476
  from_valid_sites_only=True,
481
477
  )
482
478
 
479
+ # -----------------------------
480
+ # Optional inversion along positions axis
481
+ # -----------------------------
482
+ if getattr(cfg, "invert_adata", False):
483
+ adata = invert_adata(adata)
484
+
485
+ # -----------------------------
486
+ # Optional reindexing by reference
487
+ # -----------------------------
488
+ reindex_references_adata(
489
+ adata,
490
+ reference_col=cfg.reference_column,
491
+ offsets=cfg.reindexing_offsets,
492
+ new_col=cfg.reindexed_var_suffix,
493
+ )
494
+
483
495
  ############### Duplicate detection for conversion/deamination SMF ###############
484
496
  if smf_modality != "direct":
485
497
  references = adata.obs[cfg.reference_column].cat.categories
@@ -489,7 +501,7 @@ def preprocess_adata_core(
489
501
  for site_type in cfg.duplicate_detection_site_types:
490
502
  var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
491
503
 
492
- pp_dup_qc_dir = pp_dir / "05_read_duplication_QC_metrics"
504
+ pp_dup_qc_dir = preprocess_directory / "05_read_duplication_QC_metrics"
493
505
 
494
506
  make_dirs([pp_dup_qc_dir])
495
507
 
@@ -514,8 +526,8 @@ def preprocess_adata_core(
514
526
  hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
515
527
  hierarchical_metric="euclidean",
516
528
  hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
517
- demux_types=("double", "already"),
518
- demux_col="demux_type",
529
+ demux_types=cfg.duplicate_detection_demux_types_to_use,
530
+ demux_col=DEMUX_TYPE,
519
531
  )
520
532
 
521
533
  # Use the flagged duplicate read groups and perform complexity analysis
@@ -541,6 +553,58 @@ def preprocess_adata_core(
541
553
  adata_unique = adata
542
554
  ########################################################################################################################
543
555
 
556
+ ############################################### Plot read span mask + base quality clustermaps ###############################################
557
+ quality_layer = None
558
+ if BASE_QUALITY_SCORES in adata.layers:
559
+ quality_layer = BASE_QUALITY_SCORES
560
+ elif "base_qualities" in adata.layers:
561
+ quality_layer = "base_qualities"
562
+
563
+ if READ_SPAN_MASK not in adata.layers or quality_layer is None:
564
+ logger.debug(
565
+ "read_span_mask and base quality layers not found; skipping read span/base quality clustermaps."
566
+ )
567
+ else:
568
+ pp_span_quality_dir = preprocess_directory / "06_read_span_and_quality_clustermaps"
569
+ if pp_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
570
+ logger.debug(
571
+ f"{pp_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
572
+ )
573
+ else:
574
+ make_dirs([pp_span_quality_dir])
575
+ plot_read_span_quality_clustermaps(
576
+ adata,
577
+ sample_col=cfg.sample_name_col_for_plotting,
578
+ reference_col=cfg.reference_column,
579
+ quality_layer=quality_layer,
580
+ read_span_layer=READ_SPAN_MASK,
581
+ demux_types=cfg.clustermap_demux_types_to_plot,
582
+ save_path=pp_span_quality_dir,
583
+ show_position_axis=True,
584
+ max_nan_fraction=0.5,
585
+ )
586
+
587
+ pp_dedup_span_quality_dir = (
588
+ preprocess_directory / "deduplicated" / "06_read_span_and_quality_clustermaps"
589
+ )
590
+ if pp_dedup_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
591
+ logger.debug(
592
+ f"{pp_dedup_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
593
+ )
594
+ elif quality_layer in adata_unique.layers and READ_SPAN_MASK in adata_unique.layers:
595
+ make_dirs([pp_dedup_span_quality_dir])
596
+ plot_read_span_quality_clustermaps(
597
+ adata_unique,
598
+ sample_col=cfg.sample_name_col_for_plotting,
599
+ reference_col=cfg.reference_column,
600
+ quality_layer=quality_layer,
601
+ read_span_layer=READ_SPAN_MASK,
602
+ demux_types=cfg.clustermap_demux_types_to_plot,
603
+ save_path=pp_dedup_span_quality_dir,
604
+ show_position_axis=True,
605
+ max_nan_fraction=0.5,
606
+ )
607
+
544
608
  ############################################### Save preprocessed adata with duplicate detection ###############################################
545
609
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
546
610
  logger.info("Saving preprocessed adata.")
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Optional, Tuple
5
+
6
+ import anndata as ad
7
+
8
+ from ..cli.chimeric_adata import chimeric_adata
9
+ from ..cli.hmm_adata import hmm_adata
10
+ from ..cli.latent_adata import latent_adata
11
+ from ..cli.load_adata import load_adata
12
+ from ..cli.preprocess_adata import preprocess_adata
13
+ from ..cli.spatial_adata import spatial_adata
14
+ from ..cli.variant_adata import variant_adata
15
+
16
+
17
def full_flow(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """Run every CLI stage, in order, against a single config path.

    The stages are invoked sequentially: load, preprocess, spatial, variant,
    chimeric, HMM, then latent. Each stage receives the same ``config_path``
    and whatever it returns is discarded.

    Parameters
    ----------
    config_path : str
        Forwarded unchanged to every stage.

    Returns
    -------
    None
        NOTE(review): the signature advertises an ``(AnnData, Path)`` tuple,
        but the body has no ``return`` statement, so callers receive ``None``.
        Confirm which of the two is intended.
    """
    # Order matters: the tuple lists the stages in the sequence they must run.
    pipeline = (
        load_adata,
        preprocess_adata,
        spatial_adata,
        variant_adata,
        chimeric_adata,
        hmm_adata,
        latent_adata,
    )
    for run_stage in pipeline:
        run_stage(config_path)