smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
CHANGED
@@ -9,6 +9,10 @@ import anndata as ad
 import pandas as pd
 from Bio import SeqIO

+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+

 ######################################################################################################
 ## Datetime functionality
@@ -431,6 +435,8 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 "layers_skipped": [],
 "obsm_converted": [],
 "obsm_skipped": [],
+"varm_converted": [],
+"varm_skipped": [],
 "X_replaced_or_converted": None,
 "errors": [],
 }
@@ -462,7 +468,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 ser = df[col]
 # categorical handling
 try:
-is_cat =
+is_cat = isinstance(ser.dtype, pd.CategoricalDtype)
 except Exception:
 is_cat = False

@@ -472,7 +478,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 cats_str = cats.astype(str)
 df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
 if verbose:
-
+logger.debug(
+f" coerced categorical column '{which}.{col}' -> string categories"
+)
 if which == "obs":
 report["obs_converted_columns"].append(col)
 else:
@@ -487,7 +495,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 report["var_backed_up_columns"].append(col)
 df[col] = ser.astype(str)
 if verbose:
-
+logger.debug(
 f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
 )
 continue
@@ -510,7 +518,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 report["var_backed_up_columns"].append(col)
 df[col] = ser.values.astype(str)
 if verbose:
-
+logger.debug(
 f" converted object column '{which}.{col}' -> strings (backup={backup})"
 )
 if which == "obs":
@@ -535,7 +543,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 report["var_backed_up_columns"].append(col)
 df[col] = [json.dumps(v, default=str) for v in ser.values]
 if verbose:
-
+logger.debug(
 f" json-stringified object column '{which}.{col}' (backup={backup})"
 )
 if which == "obs":
@@ -552,7 +560,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 report["var_backed_up_columns"].append(col)
 df[col] = ser.astype(str)
 if verbose:
-
+logger.debug(
 f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
 )
 if which == "obs":
@@ -581,7 +589,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 _backup(v, f"uns_{k}_backup")
 backed_up.append(k)
 if verbose:
-
+logger.debug(
 f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
 )
 report["uns_json_keys"].append(k)
@@ -592,23 +600,31 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 clean[k + "_str"] = str(v)
 backed_up.append(k)
 if verbose:
-
+logger.debug(
+f" uns['{k}'] stored as string under '{k}_str' (backed up)."
+)
 report["uns_backed_up_keys"].append(k)
 except Exception as e:
 msg = f"uns['{k}'] could not be preserved: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(" " + msg)
 if backed_up and verbose:
-
+logger.debug(f"Sanitized .uns keys (backed up): {backed_up}")
 return clean

 def _sanitize_layers_obsm(src_dict, which: str):
 """
-Ensure arrays in layers/obsm are numeric and non-object dtype.
+Ensure arrays in layers/obsm/varm are numeric and non-object dtype.
 Returns a cleaned dict suitable to pass into AnnData(...)
 If an entry is not convertible, it is backed up & skipped.
 """
+report_map = {
+"layers": ("layers_converted", "layers_skipped"),
+"obsm": ("obsm_converted", "obsm_skipped"),
+"varm": ("varm_converted", "varm_skipped"),
+}
+converted_key, skipped_key = report_map[which]
 cleaned = {}
 for k, v in src_dict.items():
 try:
@@ -618,30 +634,23 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 arr_f = arr.astype(float)
 cleaned[k] = arr_f
 report_key = f"{which}.{k}"
-report[
-report_key
-) if which == "layers" else report["obsm_converted"].append(report_key)
+report[converted_key].append(report_key)
 if verbose:
-
+logger.debug(f" {which}.{k} object array coerced to float.")
 except Exception:
 try:
 arr_i = arr.astype(int)
 cleaned[k] = arr_i
 report_key = f"{which}.{k}"
-report[
-report_key
-) if which == "layers" else report["obsm_converted"].append(report_key)
+report[converted_key].append(report_key)
 if verbose:
-
+logger.debug(f" {which}.{k} object array coerced to int.")
 except Exception:
 if backup:
 _backup(v, f"{which}_{k}_backup")
-
-report["layers_skipped"].append(k)
-else:
-report["obsm_skipped"].append(k)
+report[skipped_key].append(k)
 if verbose:
-
+logger.debug(
 f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
 )
 continue
@@ -650,14 +659,11 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 except Exception as e:
 if backup:
 _backup(v, f"{which}_{k}_backup")
-
-report["layers_skipped"].append(k)
-else:
-report["obsm_skipped"].append(k)
+report[skipped_key].append(k)
 msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 continue
 return cleaned

@@ -668,7 +674,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 msg = f"Failed to sanitize obs: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 obs_clean = adata.obs.copy()

 try:
@@ -677,7 +683,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 msg = f"Failed to sanitize var: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 var_clean = adata.var.copy()

 # ---------- sanitize uns ----------
@@ -687,12 +693,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 msg = f"Failed to sanitize uns: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 uns_clean = {}

 # ---------- sanitize layers and obsm ----------
 layers_src = getattr(adata, "layers", {})
 obsm_src = getattr(adata, "obsm", {})
+varm_src = getattr(adata, "varm", {})

 try:
 layers_clean = _sanitize_layers_obsm(layers_src, "layers")
@@ -700,7 +707,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 msg = f"Failed to sanitize layers: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 layers_clean = {}

 try:
@@ -709,9 +716,18 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 msg = f"Failed to sanitize obsm: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 obsm_clean = {}

+try:
+varm_clean = _sanitize_layers_obsm(varm_src, "varm")
+except Exception as e:
+msg = f"Failed to sanitize varm: {e}"
+report["errors"].append(msg)
+if verbose:
+logger.debug(msg)
+varm_clean = {}
+
 # ---------- handle X ----------
 X_to_use = adata.X
 try:
@@ -721,21 +737,21 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 X_to_use = X_arr.astype(float)
 report["X_replaced_or_converted"] = "converted_to_float"
 if verbose:
-
+logger.debug("Converted adata.X object-dtype -> float")
 except Exception:
 if backup:
 _backup(adata.X, "X_backup")
 X_to_use = np.zeros_like(X_arr, dtype=float)
 report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
 if verbose:
-
+logger.debug(
 "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
 )
 except Exception as e:
 msg = f"Error handling adata.X: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.debug(msg)
 X_to_use = adata.X

 # ---------- build lightweight AnnData copy ----------
@@ -747,7 +763,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 layers=layers_clean,
 uns=uns_clean,
 obsm=obsm_clean,
-varm=
+varm=varm_clean,
 )

 # preserve names (as strings)
@@ -761,71 +777,71 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 # --- write
 adata_copy.write_h5ad(path, compression=compression)
 if verbose:
-
+logger.debug(f"Saved safely to {path}")
 except Exception as e:
 msg = f"Failed to write h5ad: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.error(msg)
 raise

 # Print a concise interactive report
-
-
-
+logger.info("\n=== safe_write_h5ad REPORT ===")
+logger.info(f"Saved file: {path}")
+logger.info(f"Adata shape: {adata.shape}")
 if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
-
-
+logger.debug(f"obs: converted columns -> {report['obs_converted_columns']}")
+logger.debug(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
 else:
-
+logger.debug("obs: no problematic columns found.")

 if report["var_converted_columns"] or report["var_backed_up_columns"]:
-
-
+logger.debug(f"var: converted columns -> {report['var_converted_columns']}")
+logger.debug(f"var: backed-up columns -> {report['var_backed_up_columns']}")
 else:
-
+logger.debug("var: no problematic columns found.")

 if report["uns_json_keys"] or report["uns_backed_up_keys"]:
-
-
+logger.debug(f".uns: jsonified keys -> {report['uns_json_keys']}")
+logger.debug(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
 else:
-
+logger.debug(".uns: no problematic keys found.")

 if report["layers_converted"] or report["layers_skipped"]:
-
-
+logger.debug(f"layers: converted -> {report['layers_converted']}")
+logger.debug(f"layers: skipped -> {report['layers_skipped']}")
 else:
-
+logger.debug("layers: no problematic entries found.")

 if report["obsm_converted"] or report["obsm_skipped"]:
-
-
+logger.debug(f"obsm: converted -> {report['obsm_converted']}")
+logger.debug(f"obsm: skipped -> {report['obsm_skipped']}")
 else:
-
+logger.debug("obsm: no problematic entries found.")

 if report["X_replaced_or_converted"]:
-
+logger.debug(f"adata.X handled: {report['X_replaced_or_converted']}")
 else:
-
+logger.debug("adata.X: no changes.")

 if report["errors"]:
-
+logger.error("\nWarnings / errors encountered:")
 for e in report["errors"]:
-
+logger.error(" -", e)

-
+logger.info("=== end report ===\n")

 # ---------- create CSV output directory ----------
 try:
 csv_dir = path.parent / "csvs"
 csv_dir.mkdir(exist_ok=True)
 if verbose:
-
+logger.info(f"CSV outputs will be written to: {csv_dir}")
 except Exception as e:
 msg = f"Failed to create CSV output directory: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.error(msg)
 csv_dir = path.parent # fallback just in case

 # ---------- write keys summary CSV ----------
@@ -872,6 +888,26 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 }
 )

+# varm
+for k, v in adata_copy.varm.items():
+meta_rows.append(
+{
+"kind": "varm",
+"name": k,
+"dtype": str(np.asarray(v).dtype),
+}
+)
+
+# obsp
+for k, v in adata_copy.obsp.items():
+meta_rows.append(
+{
+"kind": "obsp",
+"name": k,
+"dtype": str(np.asarray(v).dtype),
+}
+)
+
 # uns
 for k, v in adata_copy.uns.items():
 meta_rows.append(
@@ -890,13 +926,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No

 meta_df.to_csv(meta_path, index=False)
 if verbose:
-
+logger.info(f"Wrote keys summary CSV to {meta_path}")

 except Exception as e:
 msg = f"Failed to write keys CSV: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.error(msg)

 # ---------- write full obs and var dataframes ----------
 try:
@@ -909,14 +945,14 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 adata_copy.var.to_csv(var_path, index=True)

 if verbose:
-
-
+logger.info(f"Wrote obs DataFrame to {obs_path}")
+logger.info(f"Wrote var DataFrame to {var_path}")

 except Exception as e:
 msg = f"Failed to write obs/var CSVs: {e}"
 report["errors"].append(msg)
 if verbose:
-
+logger.error(msg)

 return report

@@ -977,6 +1013,7 @@ def safe_read_h5ad(
 "parsed_uns_json_keys": [],
 "restored_layers": [],
 "restored_obsm": [],
+"restored_varm": [],
 "recategorized_obs": [],
 "recategorized_var": [],
 "missing_backups": [],
@@ -984,7 +1021,7 @@ def safe_read_h5ad(
 }

 if verbose:
-
+logger.info(f"[safe_read_h5ad] loading {path}")

 # 1) load the cleaned h5ad
 try:
@@ -994,7 +1031,7 @@ def safe_read_h5ad(

 # Ensure backup_dir exists (may be relative to cwd)
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] looking for backups in {backup_dir}")

 def _load_pickle_if_exists(fname):
 if os.path.exists(fname):
@@ -1005,7 +1042,7 @@ def safe_read_h5ad(
 except Exception as e:
 report["errors"].append(f"Failed to load pickle {fname}: {e}")
 if verbose:
-
+logger.error(f" error loading {fname}: {e}")
 return None
 return None

@@ -1030,7 +1067,7 @@ def safe_read_h5ad(
 report["restored_obs_columns"].append((col, bname2))
 restored = True
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
 except Exception as e:
 report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
 restored = False
@@ -1048,7 +1085,7 @@ def safe_read_h5ad(
 report["restored_obs_columns"].append((col, bname1))
 restored = True
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
 except Exception as e:
 report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
 restored = False
@@ -1079,7 +1116,7 @@ def safe_read_h5ad(
 report["restored_obs_columns"].append((col, "parsed_json"))
 restored = True
 if verbose:
-
+logger.debug(
 f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
 )

@@ -1092,7 +1129,7 @@ def safe_read_h5ad(
 adata.obs[col] = adata.obs[col].astype(str).astype("category")
 report["recategorized_obs"].append(col)
 if verbose:
-
+logger.debug(
 f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
 )
 except Exception as e:
@@ -1115,7 +1152,7 @@ def safe_read_h5ad(
 report["restored_var_columns"].append((col, bname2))
 restored = True
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname2}")
 except Exception as e:
 report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")

@@ -1132,7 +1169,7 @@ def safe_read_h5ad(
 report["restored_var_columns"].append((col, bname1))
 restored = True
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname1}")
 except Exception as e:
 report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")

@@ -1160,7 +1197,7 @@ def safe_read_h5ad(
 adata.var[col] = pd.Series(parsed, index=adata.var.index)
 report["restored_var_columns"].append((col, "parsed_json"))
 if verbose:
-
+logger.debug(
 f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
 )

@@ -1171,7 +1208,7 @@ def safe_read_h5ad(
 adata.var[col] = adata.var[col].astype(str).astype("category")
 report["recategorized_var"].append(col)
 if verbose:
-
+logger.debug(
 f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
 )
 except Exception as e:
@@ -1189,7 +1226,7 @@ def safe_read_h5ad(
 adata.uns[base] = parsed
 report["parsed_uns_json_keys"].append(base)
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
 # remove the _json entry
 try:
 del adata.uns[k]
@@ -1212,10 +1249,10 @@ def safe_read_h5ad(
 adata.uns[key] = val
 report["restored_uns_keys"].append((key, full))
 if verbose:
-
+logger.debug(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")

 # 5) Restore layers and obsm from backups if present
-# expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+# expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl, varm_<name>_backup.pkl
 if os.path.isdir(backup_dir):
 for fname in os.listdir(backup_dir):
 if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
@@ -1227,7 +1264,9 @@ def safe_read_h5ad(
 adata.layers[layer_name] = np.asarray(val)
 report["restored_layers"].append((layer_name, full))
 if verbose:
-
+logger.debug(
+f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}"
+)
 except Exception as e:
 report["errors"].append(
 f"Failed to restore layers['{layer_name}'] from {full}: {e}"
@@ -1242,12 +1281,31 @@ def safe_read_h5ad(
 adata.obsm[obsm_name] = np.asarray(val)
 report["restored_obsm"].append((obsm_name, full))
 if verbose:
-
+logger.debug(
+f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}"
+)
 except Exception as e:
 report["errors"].append(
 f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
 )

+if fname.startswith("varm_") and fname.endswith("_backup.pkl"):
+varm_name = fname[len("varm_") : -len("_backup.pkl")]
+full = os.path.join(backup_dir, fname)
+val = _load_pickle_if_exists(full)
+if val is not None:
+try:
+adata.varm[varm_name] = np.asarray(val)
+report["restored_varm"].append((varm_name, full))
+if verbose:
+logger.debug(
+f"[safe_read_h5ad] restored varm['{varm_name}'] from {full}"
+)
+except Exception as e:
+report["errors"].append(
+f"Failed to restore varm['{varm_name}'] from {full}: {e}"
+)
+
 # 6) If restore_backups True but some expected backups missing, note them
 if restore_backups and os.path.isdir(backup_dir):
 # detect common expected names from obs/var/uns/layers in adata
@@ -1276,7 +1334,7 @@ def safe_read_h5ad(
 if expected_missing and verbose:
 n = len(expected_missing)
 if verbose:
-
+logger.warning(
 f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
 )
 # add to report
@@ -1284,35 +1342,37 @@ def safe_read_h5ad(

 # final summary print
 if verbose:
-
+logger.info("\n=== safe_read_h5ad summary ===")
 if report["restored_obs_columns"]:
-
+logger.info("Restored obs columns:", report["restored_obs_columns"])
 if report["restored_var_columns"]:
-
+logger.info("Restored var columns:", report["restored_var_columns"])
 if report["restored_uns_keys"]:
-
+logger.info("Restored uns keys:", report["restored_uns_keys"])
 if report["parsed_uns_json_keys"]:
-
+logger.info("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
 if report["restored_layers"]:
-
+logger.info("Restored layers:", report["restored_layers"])
 if report["restored_obsm"]:
-
+logger.info("Restored obsm:", report["restored_obsm"])
+if report["restored_varm"]:
+logger.info("Restored varm:", report["restored_varm"])
 if report["recategorized_obs"] or report["recategorized_var"]:
-
+logger.info(
 "Recategorized columns (obs/var):",
 report["recategorized_obs"],
 report["recategorized_var"],
 )
 if report["missing_backups"]:
-
+logger.info(
 "Missing backups or object columns without backups (investigate):",
 report["missing_backups"],
 )
 if report["errors"]:
-
+logger.error("Errors encountered (see report['errors']):")
 for e in report["errors"]:
-
-
+logger.error(" -", e)
+logger.info("=== end summary ===\n")

 return adata, report
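
Below is a minimal, hypothetical round-trip sketch (not part of the package) showing how the varm handling added in this release might be exercised. The import path, keyword arguments beyond those visible in the hunk headers, and the report keys are inferred from this diff, so treat them as assumptions rather than documented API.

```python
# Hypothetical usage sketch based on this diff; names not shown in the diff are assumptions.
from pathlib import Path

import anndata as ad
import numpy as np

from smftools.readwrite import safe_read_h5ad, safe_write_h5ad  # assumed import path

# Small AnnData with an object-dtype varm entry that a plain write_h5ad would reject.
adata = ad.AnnData(X=np.random.rand(4, 3))
adata.varm["loadings"] = np.array([[1, 2], [3, 4], [5, 6]], dtype=object)

out = Path("demo.h5ad")
# 0.3.2 adds "varm_converted" / "varm_skipped" to the write report.
write_report = safe_write_h5ad(adata, out, compression="gzip", backup=True)
print(write_report["varm_converted"], write_report["varm_skipped"])

# 0.3.2 adds "restored_varm" to the read report.
restored, read_report = safe_read_h5ad(out)
print(read_report["restored_varm"])
```

Per the diff, entries the sanitizer cannot coerce to a numeric dtype are pickled to varm_<name>_backup.pkl and skipped during writing, then restored by safe_read_h5ad when the backup files are present.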