smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from pathlib import Path
4
5
  from typing import Optional, Tuple
5
6
 
6
7
  import anndata as ad
7
8
 
8
- from smftools.logging_utils import get_logger
9
+ from smftools.constants import LOGGING_DIR, PREPROCESS_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
9
11
 
10
12
  logger = get_logger(__name__)
11
13
 
@@ -36,30 +38,23 @@ def preprocess_adata(
36
38
  Path to preprocessed, duplicate-removed AnnData.
37
39
  """
38
40
  from ..readwrite import safe_read_h5ad
39
- from .helpers import get_adata_paths
40
- from .load_adata import load_adata
41
+ from .helpers import get_adata_paths, load_experiment_config
41
42
 
42
43
  # 1) Ensure config is loaded and at least *some* AnnData stage exists
43
- loaded_adata, loaded_path, cfg = load_adata(config_path)
44
+ cfg = load_experiment_config(config_path)
44
45
 
45
46
  # 2) Compute canonical paths
46
47
  paths = get_adata_paths(cfg)
47
48
  raw_path = paths.raw
48
49
  pp_path = paths.pp
49
50
  pp_dedup_path = paths.pp_dedup
50
- spatial_path = paths.spatial
51
- hmm_path = paths.hmm
52
51
 
53
52
  raw_exists = raw_path.exists()
54
53
  pp_exists = pp_path.exists()
55
54
  pp_dedup_exists = pp_dedup_path.exists()
56
- spatial_exists = spatial_path.exists()
57
- hmm_exists = hmm_path.exists()
58
55
 
59
- # Helper: reuse loaded_adata if it matches the path we want, else read from disk
56
+ # Helper: read from disk
60
57
  def _load(path: Path):
61
- if loaded_adata is not None and loaded_path == path:
62
- return loaded_adata
63
58
  adata, _ = safe_read_h5ad(path)
64
59
  return adata
65
60
 
@@ -67,20 +62,8 @@ def preprocess_adata(
67
62
  # Case A: full redo of preprocessing
68
63
  # -----------------------------
69
64
  if getattr(cfg, "force_redo_preprocessing", False):
70
- logger.info(
71
- "Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
72
- )
73
-
74
- if hmm_exists:
75
- adata = _load(hmm_path)
76
- source_path = hmm_path
77
- elif spatial_exists:
78
- adata = _load(spatial_path)
79
- source_path = spatial_path
80
- elif pp_dedup_exists:
81
- adata = _load(pp_dedup_path)
82
- source_path = pp_dedup_path
83
- elif pp_exists:
65
+ logger.info("Forcing full redo of preprocessing workflow.")
66
+ if pp_exists:
84
67
  adata = _load(pp_path)
85
68
  source_path = pp_path
86
69
  elif raw_exists:
@@ -135,26 +118,16 @@ def preprocess_adata(
135
118
  # Case C: normal behavior (no explicit redo flags)
136
119
  # -----------------------------
137
120
 
138
- # If HMM exists, preprocessing is considered “done enough”
139
- if hmm_exists:
140
- logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
141
- return (None, None, None, None)
142
-
143
- # If spatial exists, also skip re-preprocessing by default
144
- if spatial_exists:
145
- logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
146
- return (None, None, None, None)
147
-
148
121
  # If pp_dedup exists, just return paths (no recomputation)
149
122
  if pp_dedup_exists:
150
- logger.debug(
123
+ logger.info(
151
124
  f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
152
125
  )
153
126
  return (None, pp_path, None, pp_dedup_path)
154
127
 
155
128
  # If pp exists but pp_dedup does not, load pp and run core
156
129
  if pp_exists:
157
- logger.debug(f"Preprocessed AnnData found: {pp_path}")
130
+ logger.info(f"Preprocessed AnnData found: {pp_path}")
158
131
  adata = _load(pp_path)
159
132
  source_path = pp_path
160
133
  pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
@@ -218,13 +191,19 @@ def preprocess_adata_core(
218
191
  pp_dup_rem_adata_path : Path
219
192
  Path where pp_dedup_adata was written.
220
193
  """
194
+ from datetime import datetime
221
195
  from pathlib import Path
222
196
 
223
197
  from ..metadata import record_smftools_metadata
224
- from ..plotting import plot_read_qc_histograms
198
+ from ..plotting import (
199
+ plot_read_qc_histograms,
200
+ plot_read_span_quality_clustermaps,
201
+ plot_sequence_integer_encoding_clustermaps,
202
+ )
225
203
  from ..preprocessing import (
226
204
  append_base_context,
227
205
  append_binary_layer_by_base_context,
206
+ append_mismatch_frequency_sites,
228
207
  binarize_adata,
229
208
  binarize_on_Youden,
230
209
  calculate_complexity_II,
@@ -235,22 +214,39 @@ def preprocess_adata_core(
235
214
  filter_reads_on_length_quality_mapping,
236
215
  filter_reads_on_modification_thresholds,
237
216
  flag_duplicate_reads,
217
+ invert_adata,
238
218
  load_sample_sheet,
219
+ reindex_references_adata,
239
220
  )
240
221
  from ..readwrite import make_dirs
241
222
  from .helpers import write_gz_h5ad
242
223
 
243
224
  ################################### 1) Load existing ###################################
225
+ date_str = datetime.today().strftime("%y%m%d")
226
+ now = datetime.now()
227
+ time_str = now.strftime("%H%M%S")
228
+
229
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
230
+
244
231
  # General config variable init - Necessary user passed inputs
245
232
  smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
246
233
  output_directory = Path(
247
234
  cfg.output_directory
248
235
  ) # Path to the output directory to make for the analysis. Necessary.
249
- make_dirs([output_directory])
236
+ preprocess_directory = output_directory / PREPROCESS_DIR
237
+ logging_directory = preprocess_directory / LOGGING_DIR
250
238
 
251
- ######### Begin Preprocessing #########
252
- pp_dir = output_directory / "preprocessed"
239
+ make_dirs([output_directory, preprocess_directory])
240
+
241
+ if cfg.emit_log_file:
242
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
243
+ make_dirs([logging_directory])
244
+ else:
245
+ log_file = None
253
246
 
247
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
248
+
249
+ ######### Begin Preprocessing #########
254
250
  ## Load sample sheet metadata based on barcode mapping ##
255
251
  if getattr(cfg, "sample_sheet_path", None):
256
252
  load_sample_sheet(
@@ -264,12 +260,12 @@ def preprocess_adata_core(
264
260
  pass
265
261
 
266
262
  # Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
267
- pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
263
+ pp_length_qc_dir = preprocess_directory / "01_Read_length_and_quality_QC_metrics"
268
264
 
269
265
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
270
266
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
271
267
  else:
272
- make_dirs([pp_dir, pp_length_qc_dir])
268
+ make_dirs([preprocess_directory, pp_length_qc_dir])
273
269
  plot_read_qc_histograms(
274
270
  adata,
275
271
  pp_length_qc_dir,
@@ -292,12 +288,12 @@ def preprocess_adata_core(
292
288
  )
293
289
  print(adata.shape)
294
290
 
295
- pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
291
+ pp_length_qc_dir = preprocess_directory / "02_Read_length_and_quality_QC_metrics_post_filtering"
296
292
 
297
293
  if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
298
294
  logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
299
295
  else:
300
- make_dirs([pp_dir, pp_length_qc_dir])
296
+ make_dirs([preprocess_directory, pp_length_qc_dir])
301
297
  plot_read_qc_histograms(
302
298
  adata,
303
299
  pp_length_qc_dir,
@@ -310,7 +306,7 @@ def preprocess_adata_core(
310
306
  if smf_modality == "direct":
311
307
  native = True
312
308
  if cfg.fit_position_methylation_thresholds:
313
- pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
309
+ pp_Youden_dir = preprocess_directory / "02B_Position_wide_Youden_threshold_performance"
314
310
  make_dirs([pp_Youden_dir])
315
311
  # Calculate positional methylation thresholds for mod calls
316
312
  calculate_position_Youden(
@@ -359,7 +355,6 @@ def preprocess_adata_core(
359
355
  )
360
356
 
361
357
  ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
362
- # Additionally, store base_context level binary modification arrays in adata.obsm
363
358
  append_base_context(
364
359
  adata,
365
360
  ref_column=cfg.reference_column,
@@ -378,17 +373,18 @@ def preprocess_adata_core(
378
373
  cfg.mod_target_bases,
379
374
  bypass=cfg.bypass_calculate_read_modification_stats,
380
375
  force_redo=cfg.force_redo_calculate_read_modification_stats,
376
+ smf_modality=cfg.smf_modality,
381
377
  )
382
378
 
383
379
  ### Make a dir for outputting sample level read modification metrics before filtering ###
384
- pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
380
+ pp_meth_qc_dir = preprocess_directory / "03_read_modification_QC_metrics"
385
381
 
386
382
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
387
383
  logger.debug(
388
384
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
389
385
  )
390
386
  else:
391
- make_dirs([pp_dir, pp_meth_qc_dir])
387
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
392
388
  obs_to_plot = ["Raw_modification_signal"]
393
389
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
394
390
  obs_to_plot += [
@@ -422,14 +418,14 @@ def preprocess_adata_core(
422
418
  force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
423
419
  )
424
420
 
425
- pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
421
+ pp_meth_qc_dir = preprocess_directory / "04_read_modification_QC_metrics_post_filtering"
426
422
 
427
423
  if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
428
424
  logger.debug(
429
425
  f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
430
426
  )
431
427
  else:
432
- make_dirs([pp_dir, pp_meth_qc_dir])
428
+ make_dirs([preprocess_directory, pp_meth_qc_dir])
433
429
  obs_to_plot = ["Raw_modification_signal"]
434
430
  if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
435
431
  obs_to_plot += [
@@ -489,7 +485,7 @@ def preprocess_adata_core(
489
485
  for site_type in cfg.duplicate_detection_site_types:
490
486
  var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
491
487
 
492
- pp_dup_qc_dir = pp_dir / "05_read_duplication_QC_metrics"
488
+ pp_dup_qc_dir = preprocess_directory / "05_read_duplication_QC_metrics"
493
489
 
494
490
  make_dirs([pp_dup_qc_dir])
495
491
 
@@ -514,7 +510,7 @@ def preprocess_adata_core(
514
510
  hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
515
511
  hierarchical_metric="euclidean",
516
512
  hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
517
- demux_types=("double", "already"),
513
+ demux_types=cfg.duplicate_detection_demux_types_to_use,
518
514
  demux_col="demux_type",
519
515
  )
520
516
 
@@ -541,6 +537,135 @@ def preprocess_adata_core(
541
537
  adata_unique = adata
542
538
  ########################################################################################################################
543
539
 
540
+ # -----------------------------
541
+ # Optional inversion along positions axis
542
+ # -----------------------------
543
+ if getattr(cfg, "invert_adata", False):
544
+ adata = invert_adata(adata)
545
+
546
+ # -----------------------------
547
+ # Optional reindexing by reference
548
+ # -----------------------------
549
+ reindex_references_adata(
550
+ adata,
551
+ reference_col=cfg.reference_column,
552
+ offsets=cfg.reindexing_offsets,
553
+ new_col=cfg.reindexed_var_suffix,
554
+ )
555
+
556
+ ############################################### Append mismatch frequency per position ###############################################
557
+ append_mismatch_frequency_sites(
558
+ adata_unique,
559
+ ref_column=cfg.reference_column,
560
+ mismatch_layer=cfg.mismatch_frequency_layer,
561
+ read_span_layer=cfg.mismatch_frequency_read_span_layer,
562
+ mismatch_frequency_range=cfg.mismatch_frequency_range,
563
+ bypass=cfg.bypass_append_mismatch_frequency_sites,
564
+ force_redo=cfg.force_redo_append_mismatch_frequency_sites,
565
+ )
566
+
567
+ ############################################### Plot integer sequence encoding clustermaps ###############################################
568
+ if "sequence_integer_encoding" not in adata.layers:
569
+ logger.debug(
570
+ "sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
571
+ )
572
+ else:
573
+ pp_seq_clustermap_dir = preprocess_directory / "06_sequence_integer_encoding_clustermaps"
574
+ if pp_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
575
+ logger.debug(
576
+ f"{pp_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
577
+ )
578
+ else:
579
+ make_dirs([pp_seq_clustermap_dir])
580
+ plot_sequence_integer_encoding_clustermaps(
581
+ adata,
582
+ sample_col=cfg.sample_name_col_for_plotting,
583
+ reference_col=cfg.reference_column,
584
+ demux_types=cfg.clustermap_demux_types_to_plot,
585
+ min_quality=None,
586
+ min_length=None,
587
+ min_mapped_length_to_reference_length_ratio=None,
588
+ sort_by="none",
589
+ max_unknown_fraction=0.5,
590
+ save_path=pp_seq_clustermap_dir,
591
+ show_position_axis=True,
592
+ )
593
+
594
+ pp_dedup_seq_clustermap_dir = (
595
+ preprocess_directory / "deduplicated" / "06_sequence_integer_encoding_clustermaps"
596
+ )
597
+ if pp_dedup_seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
598
+ logger.debug(
599
+ f"{pp_dedup_seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
600
+ )
601
+ else:
602
+ make_dirs([pp_dedup_seq_clustermap_dir])
603
+ plot_sequence_integer_encoding_clustermaps(
604
+ adata_unique,
605
+ sample_col=cfg.sample_name_col_for_plotting,
606
+ reference_col=cfg.reference_column,
607
+ demux_types=cfg.clustermap_demux_types_to_plot,
608
+ min_quality=None,
609
+ min_length=None,
610
+ min_mapped_length_to_reference_length_ratio=None,
611
+ sort_by="none",
612
+ max_unknown_fraction=0.5,
613
+ save_path=pp_dedup_seq_clustermap_dir,
614
+ show_position_axis=True,
615
+ )
616
+
617
+ ############################################### Plot read span mask + base quality clustermaps ###############################################
618
+ quality_layer = None
619
+ if "base_quality_scores" in adata.layers:
620
+ quality_layer = "base_quality_scores"
621
+ elif "base_qualities" in adata.layers:
622
+ quality_layer = "base_qualities"
623
+
624
+ if "read_span_mask" not in adata.layers or quality_layer is None:
625
+ logger.debug(
626
+ "read_span_mask and base quality layers not found; skipping read span/base quality clustermaps."
627
+ )
628
+ else:
629
+ pp_span_quality_dir = preprocess_directory / "07_read_span_quality_clustermaps"
630
+ if pp_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
631
+ logger.debug(
632
+ f"{pp_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
633
+ )
634
+ else:
635
+ make_dirs([pp_span_quality_dir])
636
+ plot_read_span_quality_clustermaps(
637
+ adata,
638
+ sample_col=cfg.sample_name_col_for_plotting,
639
+ reference_col=cfg.reference_column,
640
+ quality_layer=quality_layer,
641
+ read_span_layer="read_span_mask",
642
+ demux_types=cfg.clustermap_demux_types_to_plot,
643
+ save_path=pp_span_quality_dir,
644
+ show_position_axis=True,
645
+ max_nan_fraction=0.5,
646
+ )
647
+
648
+ pp_dedup_span_quality_dir = (
649
+ preprocess_directory / "deduplicated" / "07_read_span_quality_clustermaps"
650
+ )
651
+ if pp_dedup_span_quality_dir.is_dir() and not cfg.force_redo_preprocessing:
652
+ logger.debug(
653
+ f"{pp_dedup_span_quality_dir} already exists. Skipping read span/base quality clustermaps."
654
+ )
655
+ elif quality_layer in adata_unique.layers and "read_span_mask" in adata_unique.layers:
656
+ make_dirs([pp_dedup_span_quality_dir])
657
+ plot_read_span_quality_clustermaps(
658
+ adata_unique,
659
+ sample_col=cfg.sample_name_col_for_plotting,
660
+ reference_col=cfg.reference_column,
661
+ quality_layer=quality_layer,
662
+ read_span_layer="read_span_mask",
663
+ demux_types=cfg.clustermap_demux_types_to_plot,
664
+ save_path=pp_dedup_span_quality_dir,
665
+ show_position_axis=True,
666
+ max_nan_fraction=0.5,
667
+ )
668
+
544
669
  ############################################### Save preprocessed adata with duplicate detection ###############################################
545
670
  if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
546
671
  logger.info("Saving preprocessed adata.")