smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (66)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
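
The headline change in readwrite.py (shown below) is routing console output through a module-level logger instead of bare print calls. The diff shows the module importing smftools.logging_utils.get_logger and binding logger = get_logger(__name__); the helper's internals are not part of this diff, so the sketch below is a hypothetical reconstruction of how such a wrapper is commonly wired:

    import logging

    def get_logger(name: str) -> logging.Logger:
        # Hypothetical stand-in for smftools.logging_utils.get_logger;
        # the real implementation is not shown in this diff.
        logger = logging.getLogger(name)
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
            logger.addHandler(handler)
        return logger

    logger = get_logger(__name__)
    logger.info("routed through the package logger")

Routing through the logging hierarchy lets callers tune verbosity per module rather than relying on a verbose flag alone.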
smftools/readwrite.py CHANGED
@@ -9,6 +9,10 @@ import anndata as ad
 import pandas as pd
 from Bio import SeqIO
 
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
 ######################################################################################################
 ## Datetime functionality
@@ -431,6 +435,8 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         "layers_skipped": [],
         "obsm_converted": [],
         "obsm_skipped": [],
+        "varm_converted": [],
+        "varm_skipped": [],
         "X_replaced_or_converted": None,
         "errors": [],
     }
@@ -462,7 +468,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             ser = df[col]
             # categorical handling
             try:
-                is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
+                is_cat = isinstance(ser.dtype, pd.CategoricalDtype)
             except Exception:
                 is_cat = False
 
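The hunk above swaps pd.api.types.is_categorical_dtype, which is deprecated in pandas 2.x, for an isinstance check against pd.CategoricalDtype. A minimal standalone illustration (not package code):

    import pandas as pd

    ser = pd.Series(["a", "b", "a"], dtype="category")

    # Deprecated spelling (warns on recent pandas):
    #   pd.api.types.is_categorical_dtype(ser.dtype)

    # Replacement used by the new code: same result, no deprecation warning.
    is_cat = isinstance(ser.dtype, pd.CategoricalDtype)
    assert is_cat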
@@ -472,7 +478,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 cats_str = cats.astype(str)
                 df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
                 if verbose:
-                    print(f" coerced categorical column '{which}.{col}' -> string categories")
+                    logger.debug(
+                        f" coerced categorical column '{which}.{col}' -> string categories"
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:
@@ -487,7 +495,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    logger.debug(
                         f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
                     )
                 continue
@@ -510,7 +518,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                 df[col] = ser.values.astype(str)
                 if verbose:
-                    print(
+                    logger.debug(
                         f" converted object column '{which}.{col}' -> strings (backup={backup})"
                     )
                 if which == "obs":
@@ -535,7 +543,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                 df[col] = [json.dumps(v, default=str) for v in ser.values]
                 if verbose:
-                    print(
+                    logger.debug(
                         f" json-stringified object column '{which}.{col}' (backup={backup})"
                     )
                 if which == "obs":
@@ -552,7 +560,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    logger.debug(
                         f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
                     )
                 if which == "obs":
@@ -581,7 +589,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                            _backup(v, f"uns_{k}_backup")
                        backed_up.append(k)
                        if verbose:
-                            print(
+                            logger.debug(
                                f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
                            )
                        report["uns_json_keys"].append(k)
@@ -592,23 +600,31 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                        clean[k + "_str"] = str(v)
                        backed_up.append(k)
                        if verbose:
-                            print(f" uns['{k}'] stored as string under '{k}_str' (backed up).")
+                            logger.debug(
+                                f" uns['{k}'] stored as string under '{k}_str' (backed up)."
+                            )
                        report["uns_backed_up_keys"].append(k)
            except Exception as e:
                msg = f"uns['{k}'] could not be preserved: {e}"
                report["errors"].append(msg)
                if verbose:
-                    print(" " + msg)
+                    logger.debug(" " + msg)
        if backed_up and verbose:
-            print(f"Sanitized .uns keys (backed up): {backed_up}")
+            logger.debug(f"Sanitized .uns keys (backed up): {backed_up}")
        return clean
 
    def _sanitize_layers_obsm(src_dict, which: str):
        """
-        Ensure arrays in layers/obsm are numeric and non-object dtype.
+        Ensure arrays in layers/obsm/varm are numeric and non-object dtype.
        Returns a cleaned dict suitable to pass into AnnData(...)
        If an entry is not convertible, it is backed up & skipped.
        """
+        report_map = {
+            "layers": ("layers_converted", "layers_skipped"),
+            "obsm": ("obsm_converted", "obsm_skipped"),
+            "varm": ("varm_converted", "varm_skipped"),
+        }
+        converted_key, skipped_key = report_map[which]
        cleaned = {}
        for k, v in src_dict.items():
            try:
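The report_map addition above replaces the repeated `if which == "layers" ... else ...` branches with a lookup table mapping each container name to its pair of report keys, so one loop body can serve layers, obsm, and varm alike. A standalone sketch of the pattern (entry names here are hypothetical, not package data):

    report_map = {
        "layers": ("layers_converted", "layers_skipped"),
        "obsm": ("obsm_converted", "obsm_skipped"),
        "varm": ("varm_converted", "varm_skipped"),
    }

    # One report bucket per key, then a single code path for any container.
    report = {key: [] for pair in report_map.values() for key in pair}
    converted_key, skipped_key = report_map["varm"]
    report[converted_key].append("varm.example_entry")  # hypothetical entry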
@@ -618,30 +634,23 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                        arr_f = arr.astype(float)
                        cleaned[k] = arr_f
                        report_key = f"{which}.{k}"
-                        report["layers_converted"].append(
-                            report_key
-                        ) if which == "layers" else report["obsm_converted"].append(report_key)
+                        report[converted_key].append(report_key)
                        if verbose:
-                            print(f" {which}.{k} object array coerced to float.")
+                            logger.debug(f" {which}.{k} object array coerced to float.")
                    except Exception:
                        try:
                            arr_i = arr.astype(int)
                            cleaned[k] = arr_i
                            report_key = f"{which}.{k}"
-                            report["layers_converted"].append(
-                                report_key
-                            ) if which == "layers" else report["obsm_converted"].append(report_key)
+                            report[converted_key].append(report_key)
                            if verbose:
-                                print(f" {which}.{k} object array coerced to int.")
+                                logger.debug(f" {which}.{k} object array coerced to int.")
                        except Exception:
                            if backup:
                                _backup(v, f"{which}_{k}_backup")
-                            if which == "layers":
-                                report["layers_skipped"].append(k)
-                            else:
-                                report["obsm_skipped"].append(k)
+                            report[skipped_key].append(k)
                            if verbose:
-                                print(
+                                logger.debug(
                                    f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
                                )
                            continue
@@ -650,14 +659,11 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
            except Exception as e:
                if backup:
                    _backup(v, f"{which}_{k}_backup")
-                if which == "layers":
-                    report["layers_skipped"].append(k)
-                else:
-                    report["obsm_skipped"].append(k)
+                report[skipped_key].append(k)
                msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
                report["errors"].append(msg)
                if verbose:
-                    print(msg)
+                    logger.debug(msg)
                continue
        return cleaned
 
@@ -668,7 +674,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        msg = f"Failed to sanitize obs: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        obs_clean = adata.obs.copy()
 
    try:
@@ -677,7 +683,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        msg = f"Failed to sanitize var: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        var_clean = adata.var.copy()
 
    # ---------- sanitize uns ----------
@@ -687,12 +693,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        msg = f"Failed to sanitize uns: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        uns_clean = {}
 
    # ---------- sanitize layers and obsm ----------
    layers_src = getattr(adata, "layers", {})
    obsm_src = getattr(adata, "obsm", {})
+    varm_src = getattr(adata, "varm", {})
 
    try:
        layers_clean = _sanitize_layers_obsm(layers_src, "layers")
@@ -700,7 +707,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        msg = f"Failed to sanitize layers: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        layers_clean = {}
 
    try:
@@ -709,9 +716,18 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        msg = f"Failed to sanitize obsm: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        obsm_clean = {}
 
+    try:
+        varm_clean = _sanitize_layers_obsm(varm_src, "varm")
+    except Exception as e:
+        msg = f"Failed to sanitize varm: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            logger.debug(msg)
+        varm_clean = {}
+
    # ---------- handle X ----------
    X_to_use = adata.X
    try:
@@ -721,21 +737,21 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                X_to_use = X_arr.astype(float)
                report["X_replaced_or_converted"] = "converted_to_float"
                if verbose:
-                    print("Converted adata.X object-dtype -> float")
+                    logger.debug("Converted adata.X object-dtype -> float")
            except Exception:
                if backup:
                    _backup(adata.X, "X_backup")
                X_to_use = np.zeros_like(X_arr, dtype=float)
                report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
                if verbose:
-                    print(
+                    logger.debug(
                        "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
                    )
    except Exception as e:
        msg = f"Error handling adata.X: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.debug(msg)
        X_to_use = adata.X
 
    # ---------- build lightweight AnnData copy ----------
@@ -747,7 +763,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        layers=layers_clean,
        uns=uns_clean,
        obsm=obsm_clean,
-        varm=getattr(adata, "varm", None),
+        varm=varm_clean,
    )
 
    # preserve names (as strings)
@@ -761,71 +777,71 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        # --- write
        adata_copy.write_h5ad(path, compression=compression)
        if verbose:
-            print(f"Saved safely to {path}")
+            logger.debug(f"Saved safely to {path}")
    except Exception as e:
        msg = f"Failed to write h5ad: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.error(msg)
        raise
 
    # Print a concise interactive report
-    print("\n=== safe_write_h5ad REPORT ===")
-    print(f"Saved file: {path}")
-    print(f"Adata shape: {adata.shape}")
+    logger.info("\n=== safe_write_h5ad REPORT ===")
+    logger.info(f"Saved file: {path}")
+    logger.info(f"Adata shape: {adata.shape}")
    if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
-        print(f"obs: converted columns -> {report['obs_converted_columns']}")
-        print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
+        logger.debug(f"obs: converted columns -> {report['obs_converted_columns']}")
+        logger.debug(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
    else:
-        print("obs: no problematic columns found.")
+        logger.debug("obs: no problematic columns found.")
 
    if report["var_converted_columns"] or report["var_backed_up_columns"]:
-        print(f"var: converted columns -> {report['var_converted_columns']}")
-        print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
+        logger.debug(f"var: converted columns -> {report['var_converted_columns']}")
+        logger.debug(f"var: backed-up columns -> {report['var_backed_up_columns']}")
    else:
-        print("var: no problematic columns found.")
+        logger.debug("var: no problematic columns found.")
 
    if report["uns_json_keys"] or report["uns_backed_up_keys"]:
-        print(f".uns: jsonified keys -> {report['uns_json_keys']}")
-        print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
+        logger.debug(f".uns: jsonified keys -> {report['uns_json_keys']}")
+        logger.debug(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
    else:
-        print(".uns: no problematic keys found.")
+        logger.debug(".uns: no problematic keys found.")
 
    if report["layers_converted"] or report["layers_skipped"]:
-        print(f"layers: converted -> {report['layers_converted']}")
-        print(f"layers: skipped -> {report['layers_skipped']}")
+        logger.debug(f"layers: converted -> {report['layers_converted']}")
+        logger.debug(f"layers: skipped -> {report['layers_skipped']}")
    else:
-        print("layers: no problematic entries found.")
+        logger.debug("layers: no problematic entries found.")
 
    if report["obsm_converted"] or report["obsm_skipped"]:
-        print(f"obsm: converted -> {report['obsm_converted']}")
-        print(f"obsm: skipped -> {report['obsm_skipped']}")
+        logger.debug(f"obsm: converted -> {report['obsm_converted']}")
+        logger.debug(f"obsm: skipped -> {report['obsm_skipped']}")
    else:
-        print("obsm: no problematic entries found.")
+        logger.debug("obsm: no problematic entries found.")
 
    if report["X_replaced_or_converted"]:
-        print(f"adata.X handled: {report['X_replaced_or_converted']}")
+        logger.debug(f"adata.X handled: {report['X_replaced_or_converted']}")
    else:
-        print("adata.X: no changes.")
+        logger.debug("adata.X: no changes.")
 
    if report["errors"]:
-        print("\nWarnings / errors encountered:")
+        logger.error("\nWarnings / errors encountered:")
        for e in report["errors"]:
-            print(" -", e)
+            logger.error(" -", e)
 
-    print("=== end report ===\n")
+    logger.info("=== end report ===\n")
 
    # ---------- create CSV output directory ----------
    try:
        csv_dir = path.parent / "csvs"
        csv_dir.mkdir(exist_ok=True)
        if verbose:
-            print(f"CSV outputs will be written to: {csv_dir}")
+            logger.info(f"CSV outputs will be written to: {csv_dir}")
    except Exception as e:
        msg = f"Failed to create CSV output directory: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.error(msg)
        csv_dir = path.parent  # fallback just in case
 
    # ---------- write keys summary CSV ----------
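One subtlety of the print-to-logger migration above: stdlib logging takes lazy %-style arguments rather than print-style comma-separated values, which matters for lines like `logger.error(" -", e)`. A standalone sketch of the difference (not package code):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("demo")

    err = ValueError("boom")
    logger.error(" - %s", err)  # %-style placeholder: formatted only when emitted
    print(" -", err)            # print joins arguments with spaces; logging does not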
@@ -872,6 +888,26 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                }
            )
 
+        # varm
+        for k, v in adata_copy.varm.items():
+            meta_rows.append(
+                {
+                    "kind": "varm",
+                    "name": k,
+                    "dtype": str(np.asarray(v).dtype),
+                }
+            )
+
+        # obsp
+        for k, v in adata_copy.obsp.items():
+            meta_rows.append(
+                {
+                    "kind": "obsp",
+                    "name": k,
+                    "dtype": str(np.asarray(v).dtype),
+                }
+            )
+
        # uns
        for k, v in adata_copy.uns.items():
            meta_rows.append(
@@ -890,13 +926,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
        meta_df.to_csv(meta_path, index=False)
        if verbose:
-            print(f"Wrote keys summary CSV to {meta_path}")
+            logger.info(f"Wrote keys summary CSV to {meta_path}")
 
    except Exception as e:
        msg = f"Failed to write keys CSV: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.error(msg)
 
    # ---------- write full obs and var dataframes ----------
    try:
@@ -909,14 +945,14 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
        adata_copy.var.to_csv(var_path, index=True)
 
        if verbose:
-            print(f"Wrote obs DataFrame to {obs_path}")
-            print(f"Wrote var DataFrame to {var_path}")
+            logger.info(f"Wrote obs DataFrame to {obs_path}")
+            logger.info(f"Wrote var DataFrame to {var_path}")
 
    except Exception as e:
        msg = f"Failed to write obs/var CSVs: {e}"
        report["errors"].append(msg)
        if verbose:
-            print(msg)
+            logger.error(msg)
 
    return report
 
@@ -977,6 +1013,7 @@ def safe_read_h5ad(
        "parsed_uns_json_keys": [],
        "restored_layers": [],
        "restored_obsm": [],
+        "restored_varm": [],
        "recategorized_obs": [],
        "recategorized_var": [],
        "missing_backups": [],
@@ -984,7 +1021,7 @@ def safe_read_h5ad(
    }
 
    if verbose:
-        print(f"[safe_read_h5ad] loading {path}")
+        logger.info(f"[safe_read_h5ad] loading {path}")
 
    # 1) load the cleaned h5ad
    try:
@@ -994,7 +1031,7 @@ def safe_read_h5ad(
 
    # Ensure backup_dir exists (may be relative to cwd)
    if verbose:
-        print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
+        logger.debug(f"[safe_read_h5ad] looking for backups in {backup_dir}")
 
    def _load_pickle_if_exists(fname):
        if os.path.exists(fname):
@@ -1005,7 +1042,7 @@ def safe_read_h5ad(
            except Exception as e:
                report["errors"].append(f"Failed to load pickle {fname}: {e}")
                if verbose:
-                    print(f" error loading {fname}: {e}")
+                    logger.error(f" error loading {fname}: {e}")
                return None
        return None
 
@@ -1030,7 +1067,7 @@ def safe_read_h5ad(
                report["restored_obs_columns"].append((col, bname2))
                restored = True
                if verbose:
-                    print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
+                    logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
            except Exception as e:
                report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
                restored = False
@@ -1048,7 +1085,7 @@ def safe_read_h5ad(
                report["restored_obs_columns"].append((col, bname1))
                restored = True
                if verbose:
-                    print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
+                    logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
            except Exception as e:
                report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
                restored = False
@@ -1079,7 +1116,7 @@ def safe_read_h5ad(
                report["restored_obs_columns"].append((col, "parsed_json"))
                restored = True
                if verbose:
-                    print(
+                    logger.debug(
                        f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
                    )
 
@@ -1092,7 +1129,7 @@ def safe_read_h5ad(
                adata.obs[col] = adata.obs[col].astype(str).astype("category")
                report["recategorized_obs"].append(col)
                if verbose:
-                    print(
+                    logger.debug(
                        f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
                    )
            except Exception as e:
@@ -1115,7 +1152,7 @@ def safe_read_h5ad(
                report["restored_var_columns"].append((col, bname2))
                restored = True
                if verbose:
-                    print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
+                    logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname2}")
            except Exception as e:
                report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
 
@@ -1132,7 +1169,7 @@ def safe_read_h5ad(
                report["restored_var_columns"].append((col, bname1))
                restored = True
                if verbose:
-                    print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
+                    logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname1}")
            except Exception as e:
                report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
 
@@ -1160,7 +1197,7 @@ def safe_read_h5ad(
                adata.var[col] = pd.Series(parsed, index=adata.var.index)
                report["restored_var_columns"].append((col, "parsed_json"))
                if verbose:
-                    print(
+                    logger.debug(
                        f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
                    )
 
@@ -1171,7 +1208,7 @@ def safe_read_h5ad(
                adata.var[col] = adata.var[col].astype(str).astype("category")
                report["recategorized_var"].append(col)
                if verbose:
-                    print(
+                    logger.debug(
                        f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
                    )
            except Exception as e:
@@ -1189,7 +1226,7 @@ def safe_read_h5ad(
            adata.uns[base] = parsed
            report["parsed_uns_json_keys"].append(base)
            if verbose:
-                print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
+                logger.debug(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
            # remove the _json entry
            try:
                del adata.uns[k]
@@ -1212,10 +1249,10 @@ def safe_read_h5ad(
            adata.uns[key] = val
            report["restored_uns_keys"].append((key, full))
            if verbose:
-                print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
+                logger.debug(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
 
    # 5) Restore layers and obsm from backups if present
-    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl, varm_<name>_backup.pkl
    if os.path.isdir(backup_dir):
        for fname in os.listdir(backup_dir):
            if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
@@ -1227,7 +1264,9 @@ def safe_read_h5ad(
                        adata.layers[layer_name] = np.asarray(val)
                        report["restored_layers"].append((layer_name, full))
                        if verbose:
-                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
+                            logger.debug(
+                                f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}"
+                            )
                    except Exception as e:
                        report["errors"].append(
                            f"Failed to restore layers['{layer_name}'] from {full}: {e}"
@@ -1242,12 +1281,31 @@ def safe_read_h5ad(
                        adata.obsm[obsm_name] = np.asarray(val)
                        report["restored_obsm"].append((obsm_name, full))
                        if verbose:
-                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
+                            logger.debug(
+                                f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}"
+                            )
                    except Exception as e:
                        report["errors"].append(
                            f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
                        )
 
+            if fname.startswith("varm_") and fname.endswith("_backup.pkl"):
+                varm_name = fname[len("varm_") : -len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.varm[varm_name] = np.asarray(val)
+                        report["restored_varm"].append((varm_name, full))
+                        if verbose:
+                            logger.debug(
+                                f"[safe_read_h5ad] restored varm['{varm_name}'] from {full}"
+                            )
+                    except Exception as e:
+                        report["errors"].append(
+                            f"Failed to restore varm['{varm_name}'] from {full}: {e}"
+                        )
+
    # 6) If restore_backups True but some expected backups missing, note them
    if restore_backups and os.path.isdir(backup_dir):
        # detect common expected names from obs/var/uns/layers in adata
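Backups written by safe_write_h5ad follow a `<container>_<name>_backup.pkl` naming convention, and the restore loop above recovers the container key by slicing the prefix and suffix off the filename. A standalone check of that round trip (hypothetical filename):

    fname = "varm_PCs_backup.pkl"  # hypothetical backup of adata.varm["PCs"]
    if fname.startswith("varm_") and fname.endswith("_backup.pkl"):
        varm_name = fname[len("varm_") : -len("_backup.pkl")]
        assert varm_name == "PCs"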
@@ -1276,7 +1334,7 @@ def safe_read_h5ad(
    if expected_missing and verbose:
        n = len(expected_missing)
        if verbose:
-            print(
+            logger.warning(
                f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
            )
    # add to report
@@ -1284,35 +1342,37 @@ def safe_read_h5ad(
 
    # final summary print
    if verbose:
-        print("\n=== safe_read_h5ad summary ===")
+        logger.info("\n=== safe_read_h5ad summary ===")
        if report["restored_obs_columns"]:
-            print("Restored obs columns:", report["restored_obs_columns"])
+            logger.info("Restored obs columns:", report["restored_obs_columns"])
        if report["restored_var_columns"]:
-            print("Restored var columns:", report["restored_var_columns"])
+            logger.info("Restored var columns:", report["restored_var_columns"])
        if report["restored_uns_keys"]:
-            print("Restored uns keys:", report["restored_uns_keys"])
+            logger.info("Restored uns keys:", report["restored_uns_keys"])
        if report["parsed_uns_json_keys"]:
-            print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
+            logger.info("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
        if report["restored_layers"]:
-            print("Restored layers:", report["restored_layers"])
+            logger.info("Restored layers:", report["restored_layers"])
        if report["restored_obsm"]:
-            print("Restored obsm:", report["restored_obsm"])
+            logger.info("Restored obsm:", report["restored_obsm"])
+        if report["restored_varm"]:
+            logger.info("Restored varm:", report["restored_varm"])
        if report["recategorized_obs"] or report["recategorized_var"]:
-            print(
+            logger.info(
                "Recategorized columns (obs/var):",
                report["recategorized_obs"],
                report["recategorized_var"],
            )
        if report["missing_backups"]:
-            print(
+            logger.info(
                "Missing backups or object columns without backups (investigate):",
                report["missing_backups"],
            )
        if report["errors"]:
-            print("Errors encountered (see report['errors']):")
+            logger.error("Errors encountered (see report['errors']):")
            for e in report["errors"]:
-                print(" -", e)
-        print("=== end summary ===\n")
+                logger.error(" -", e)
+        logger.info("=== end summary ===\n")
 
    return adata, report
 
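
Taken together, safe_write_h5ad and safe_read_h5ad form a round trip: the writer coerces or backs up entries that h5py cannot serialize, and the reader restores them from the backup directory. A hedged usage sketch; keyword names are taken from the signatures visible in this diff, but safe_read_h5ad's full signature is not shown, so the second call is an assumption:

    from pathlib import Path

    import anndata as ad
    import numpy as np

    from smftools.readwrite import safe_read_h5ad, safe_write_h5ad

    adata = ad.AnnData(X=np.random.rand(10, 5))
    adata.uns["params"] = {"k": 3}  # metadata the writer may JSON-ify

    # Writer returns a report dict describing every coercion/backup it performed.
    write_report = safe_write_h5ad(adata, Path("out.h5ad"), backup=True, verbose=True)

    # Reader returns the AnnData plus a report of restored/recategorized entries.
    adata2, read_report = safe_read_h5ad("out.h5ad", verbose=True)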