smftools 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +18 -2
- smftools/cli/hmm_adata.py +18 -1
- smftools/cli/latent_adata.py +522 -67
- smftools/cli/load_adata.py +2 -2
- smftools/cli/preprocess_adata.py +32 -93
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +23 -109
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +41 -5
- smftools/config/conversion.yaml +0 -10
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +49 -13
- smftools/config/experiment_config.py +96 -3
- smftools/constants.py +4 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +53 -13
- smftools/informatics/h5ad_functions.py +83 -0
- smftools/informatics/modkit_extract_to_adata.py +4 -0
- smftools/plotting/__init__.py +26 -12
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +58 -3362
- smftools/plotting/hmm_plotting.py +1586 -2
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +3 -0
- smftools/preprocessing/append_base_context.py +1 -1
- smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +109 -85
- smftools/tools/__init__.py +6 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_nmf.py +18 -7
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +70 -154
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +640 -3
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +52 -4
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
CHANGED

Note: the registry diff view elides removed-line text, so deletions below
appear as bare (or truncated) "-" lines.

@@ -9,6 +9,10 @@ import anndata as ad
 import pandas as pd
 from Bio import SeqIO
 
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
 ######################################################################################################
 ## Datetime functionality

@@ -464,7 +468,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             ser = df[col]
             # categorical handling
             try:
-                is_cat =
+                is_cat = isinstance(ser.dtype, pd.CategoricalDtype)
             except Exception:
                 is_cat = False
 

@@ -474,7 +478,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 cats_str = cats.astype(str)
                 df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
                 if verbose:
-
+                    logger.debug(
+                        f" coerced categorical column '{which}.{col}' -> string categories"
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:

@@ -489,7 +495,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-
+                    logger.debug(
                         f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
                     )
                 continue

@@ -512,7 +518,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
                 df[col] = ser.values.astype(str)
                 if verbose:
-
+                    logger.debug(
                         f" converted object column '{which}.{col}' -> strings (backup={backup})"
                     )
                 if which == "obs":

@@ -537,7 +543,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
                 df[col] = [json.dumps(v, default=str) for v in ser.values]
                 if verbose:
-
+                    logger.debug(
                         f" json-stringified object column '{which}.{col}' (backup={backup})"
                     )
                 if which == "obs":

@@ -554,7 +560,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-
+                    logger.debug(
                         f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
                     )
                 if which == "obs":

@@ -583,7 +589,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 _backup(v, f"uns_{k}_backup")
                 backed_up.append(k)
                 if verbose:
-
+                    logger.debug(
                         f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
                     )
                 report["uns_json_keys"].append(k)

@@ -594,15 +600,17 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 clean[k + "_str"] = str(v)
                 backed_up.append(k)
                 if verbose:
-
+                    logger.debug(
+                        f" uns['{k}'] stored as string under '{k}_str' (backed up)."
+                    )
                 report["uns_backed_up_keys"].append(k)
             except Exception as e:
                 msg = f"uns['{k}'] could not be preserved: {e}"
                 report["errors"].append(msg)
                 if verbose:
-
+                    logger.debug(" " + msg)
         if backed_up and verbose:
-
+            logger.debug(f"Sanitized .uns keys (backed up): {backed_up}")
         return clean
 
     def _sanitize_layers_obsm(src_dict, which: str):

@@ -628,7 +636,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report_key = f"{which}.{k}"
                     report[converted_key].append(report_key)
                     if verbose:
-
+                        logger.debug(f" {which}.{k} object array coerced to float.")
                 except Exception:
                     try:
                         arr_i = arr.astype(int)

@@ -636,13 +644,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report_key = f"{which}.{k}"
                         report[converted_key].append(report_key)
                         if verbose:
-
+                            logger.debug(f" {which}.{k} object array coerced to int.")
                     except Exception:
                         if backup:
                             _backup(v, f"{which}_{k}_backup")
                         report[skipped_key].append(k)
                         if verbose:
-
+                            logger.debug(
                                 f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
                             )
                         continue

@@ -655,7 +663,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
                 report["errors"].append(msg)
                 if verbose:
-
+                    logger.debug(msg)
                 continue
         return cleaned
 

@@ -666,7 +674,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize obs: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         obs_clean = adata.obs.copy()
 
     try:

@@ -675,7 +683,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize var: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         var_clean = adata.var.copy()
 
     # ---------- sanitize uns ----------

@@ -685,7 +693,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize uns: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         uns_clean = {}
 
     # ---------- sanitize layers and obsm ----------

@@ -699,7 +707,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize layers: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         layers_clean = {}
 
     try:

@@ -708,7 +716,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize obsm: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         obsm_clean = {}
 
     try:

@@ -717,7 +725,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         msg = f"Failed to sanitize varm: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         varm_clean = {}
 
     # ---------- handle X ----------

@@ -729,21 +737,21 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             X_to_use = X_arr.astype(float)
             report["X_replaced_or_converted"] = "converted_to_float"
             if verbose:
-
+                logger.debug("Converted adata.X object-dtype -> float")
         except Exception:
             if backup:
                 _backup(adata.X, "X_backup")
             X_to_use = np.zeros_like(X_arr, dtype=float)
             report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
             if verbose:
-
+                logger.debug(
                     "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
                 )
     except Exception as e:
         msg = f"Error handling adata.X: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.debug(msg)
         X_to_use = adata.X
 
     # ---------- build lightweight AnnData copy ----------

@@ -769,71 +777,71 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         # --- write
         adata_copy.write_h5ad(path, compression=compression)
         if verbose:
-
+            logger.debug(f"Saved safely to {path}")
     except Exception as e:
         msg = f"Failed to write h5ad: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.error(msg)
         raise
 
     # Print a concise interactive report
-
-
-
+    logger.info("\n=== safe_write_h5ad REPORT ===")
+    logger.info(f"Saved file: {path}")
+    logger.info(f"Adata shape: {adata.shape}")
     if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
-
-
+        logger.debug(f"obs: converted columns -> {report['obs_converted_columns']}")
+        logger.debug(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
     else:
-
+        logger.debug("obs: no problematic columns found.")
 
     if report["var_converted_columns"] or report["var_backed_up_columns"]:
-
-
+        logger.debug(f"var: converted columns -> {report['var_converted_columns']}")
+        logger.debug(f"var: backed-up columns -> {report['var_backed_up_columns']}")
     else:
-
+        logger.debug("var: no problematic columns found.")
 
     if report["uns_json_keys"] or report["uns_backed_up_keys"]:
-
-
+        logger.debug(f".uns: jsonified keys -> {report['uns_json_keys']}")
+        logger.debug(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
     else:
-
+        logger.debug(".uns: no problematic keys found.")
 
     if report["layers_converted"] or report["layers_skipped"]:
-
-
+        logger.debug(f"layers: converted -> {report['layers_converted']}")
+        logger.debug(f"layers: skipped -> {report['layers_skipped']}")
     else:
-
+        logger.debug("layers: no problematic entries found.")
 
     if report["obsm_converted"] or report["obsm_skipped"]:
-
-
+        logger.debug(f"obsm: converted -> {report['obsm_converted']}")
+        logger.debug(f"obsm: skipped -> {report['obsm_skipped']}")
     else:
-
+        logger.debug("obsm: no problematic entries found.")
 
     if report["X_replaced_or_converted"]:
-
+        logger.debug(f"adata.X handled: {report['X_replaced_or_converted']}")
     else:
-
+        logger.debug("adata.X: no changes.")
 
     if report["errors"]:
-
+        logger.error("\nWarnings / errors encountered:")
         for e in report["errors"]:
-
+            logger.error(" -", e)
 
-
+    logger.info("=== end report ===\n")
 
     # ---------- create CSV output directory ----------
     try:
        csv_dir = path.parent / "csvs"
        csv_dir.mkdir(exist_ok=True)
        if verbose:
-
+            logger.info(f"CSV outputs will be written to: {csv_dir}")
     except Exception as e:
        msg = f"Failed to create CSV output directory: {e}"
        report["errors"].append(msg)
        if verbose:
-
+            logger.error(msg)
        csv_dir = path.parent  # fallback just in case
 
     # ---------- write keys summary CSV ----------

@@ -890,6 +898,16 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 }
             )
 
+        # obsp
+        for k, v in adata_copy.obsp.items():
+            meta_rows.append(
+                {
+                    "kind": "obsp",
+                    "name": k,
+                    "dtype": str(np.asarray(v).dtype),
+                }
+            )
+
         # uns
         for k, v in adata_copy.uns.items():
             meta_rows.append(

@@ -908,13 +926,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
         meta_df.to_csv(meta_path, index=False)
         if verbose:
-
+            logger.info(f"Wrote keys summary CSV to {meta_path}")
 
     except Exception as e:
         msg = f"Failed to write keys CSV: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.error(msg)
 
     # ---------- write full obs and var dataframes ----------
     try:

@@ -927,14 +945,14 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         adata_copy.var.to_csv(var_path, index=True)
 
         if verbose:
-
-
+            logger.info(f"Wrote obs DataFrame to {obs_path}")
+            logger.info(f"Wrote var DataFrame to {var_path}")
 
     except Exception as e:
         msg = f"Failed to write obs/var CSVs: {e}"
         report["errors"].append(msg)
         if verbose:
-
+            logger.error(msg)
 
     return report
 
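Aside: the safe_write_h5ad hunks above pair the logger migration with a modernized dtype check, isinstance(ser.dtype, pd.CategoricalDtype), which replaces the deprecated pd.api.types.is_categorical_dtype. The neighboring hunks then rebuild categorical columns with string categories before writing. A minimal standalone sketch of that coercion pattern (the "batch" column and its values are illustrative, not taken from smftools):

    import pandas as pd

    df = pd.DataFrame({"batch": pd.Categorical([1, 2, 1])})  # non-string (integer) categories

    ser = df["batch"]
    if isinstance(ser.dtype, pd.CategoricalDtype):  # preferred over the deprecated check
        # Mirror the diff's pattern: stringify the categories and rebuild the
        # column so it serializes as a plain string categorical.
        cats_str = ser.cat.categories.astype(str)
        df["batch"] = pd.Categorical(ser.astype(str), categories=cats_str)

    print(df["batch"].cat.categories)  # Index(['1', '2'], dtype='object')

The safe_read_h5ad hunks in the same file follow below.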
@@ -1003,7 +1021,7 @@ def safe_read_h5ad(
     }
 
     if verbose:
-
+        logger.info(f"[safe_read_h5ad] loading {path}")
 
     # 1) load the cleaned h5ad
     try:

@@ -1013,7 +1031,7 @@ def safe_read_h5ad(
 
     # Ensure backup_dir exists (may be relative to cwd)
     if verbose:
-
+        logger.debug(f"[safe_read_h5ad] looking for backups in {backup_dir}")
 
     def _load_pickle_if_exists(fname):
         if os.path.exists(fname):

@@ -1024,7 +1042,7 @@ def safe_read_h5ad(
             except Exception as e:
                 report["errors"].append(f"Failed to load pickle {fname}: {e}")
                 if verbose:
-
+                    logger.error(f" error loading {fname}: {e}")
                 return None
         return None
 

@@ -1049,7 +1067,7 @@ def safe_read_h5ad(
                 report["restored_obs_columns"].append((col, bname2))
                 restored = True
                 if verbose:
-
+                    logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
             except Exception as e:
                 report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
                 restored = False

@@ -1067,7 +1085,7 @@ def safe_read_h5ad(
                 report["restored_obs_columns"].append((col, bname1))
                 restored = True
                 if verbose:
-
+                    logger.debug(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
             except Exception as e:
                 report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
                 restored = False

@@ -1098,7 +1116,7 @@ def safe_read_h5ad(
                 report["restored_obs_columns"].append((col, "parsed_json"))
                 restored = True
                 if verbose:
-
+                    logger.debug(
                         f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
                     )
 

@@ -1111,7 +1129,7 @@ def safe_read_h5ad(
                 adata.obs[col] = adata.obs[col].astype(str).astype("category")
                 report["recategorized_obs"].append(col)
                 if verbose:
-
+                    logger.debug(
                         f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
                     )
             except Exception as e:

@@ -1134,7 +1152,7 @@ def safe_read_h5ad(
                 report["restored_var_columns"].append((col, bname2))
                 restored = True
                 if verbose:
-
+                    logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname2}")
             except Exception as e:
                 report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
 

@@ -1151,7 +1169,7 @@ def safe_read_h5ad(
                 report["restored_var_columns"].append((col, bname1))
                 restored = True
                 if verbose:
-
+                    logger.debug(f"[safe_read_h5ad] restored var.{col} from {bname1}")
             except Exception as e:
                 report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
 

@@ -1179,7 +1197,7 @@ def safe_read_h5ad(
                 adata.var[col] = pd.Series(parsed, index=adata.var.index)
                 report["restored_var_columns"].append((col, "parsed_json"))
                 if verbose:
-
+                    logger.debug(
                         f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
                     )
 

@@ -1190,7 +1208,7 @@ def safe_read_h5ad(
                 adata.var[col] = adata.var[col].astype(str).astype("category")
                 report["recategorized_var"].append(col)
                 if verbose:
-
+                    logger.debug(
                         f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
                     )
             except Exception as e:

@@ -1208,7 +1226,7 @@ def safe_read_h5ad(
                 adata.uns[base] = parsed
                 report["parsed_uns_json_keys"].append(base)
                 if verbose:
-
+                    logger.debug(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
                 # remove the _json entry
                 try:
                     del adata.uns[k]

@@ -1231,7 +1249,7 @@ def safe_read_h5ad(
             adata.uns[key] = val
             report["restored_uns_keys"].append((key, full))
             if verbose:
-
+                logger.debug(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
 
     # 5) Restore layers and obsm from backups if present
     # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl, varm_<name>_backup.pkl

@@ -1246,7 +1264,9 @@ def safe_read_h5ad(
                 adata.layers[layer_name] = np.asarray(val)
                 report["restored_layers"].append((layer_name, full))
                 if verbose:
-
+                    logger.debug(
+                        f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}"
+                    )
             except Exception as e:
                 report["errors"].append(
                     f"Failed to restore layers['{layer_name}'] from {full}: {e}"

@@ -1261,7 +1281,9 @@ def safe_read_h5ad(
                 adata.obsm[obsm_name] = np.asarray(val)
                 report["restored_obsm"].append((obsm_name, full))
                 if verbose:
-
+                    logger.debug(
+                        f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}"
+                    )
             except Exception as e:
                 report["errors"].append(
                     f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"

@@ -1276,7 +1298,9 @@ def safe_read_h5ad(
                 adata.varm[varm_name] = np.asarray(val)
                 report["restored_varm"].append((varm_name, full))
                 if verbose:
-
+                    logger.debug(
+                        f"[safe_read_h5ad] restored varm['{varm_name}'] from {full}"
+                    )
             except Exception as e:
                 report["errors"].append(
                     f"Failed to restore varm['{varm_name}'] from {full}: {e}"

@@ -1310,7 +1334,7 @@ def safe_read_h5ad(
     if expected_missing and verbose:
         n = len(expected_missing)
         if verbose:
-
+            logger.warning(
                 f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
             )
     # add to report

@@ -1318,37 +1342,37 @@ def safe_read_h5ad(
 
     # final summary print
     if verbose:
-
+        logger.info("\n=== safe_read_h5ad summary ===")
         if report["restored_obs_columns"]:
-
+            logger.info("Restored obs columns:", report["restored_obs_columns"])
         if report["restored_var_columns"]:
-
+            logger.info("Restored var columns:", report["restored_var_columns"])
         if report["restored_uns_keys"]:
-
+            logger.info("Restored uns keys:", report["restored_uns_keys"])
         if report["parsed_uns_json_keys"]:
-
+            logger.info("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
         if report["restored_layers"]:
-
+            logger.info("Restored layers:", report["restored_layers"])
         if report["restored_obsm"]:
-
+            logger.info("Restored obsm:", report["restored_obsm"])
         if report["restored_varm"]:
-
+            logger.info("Restored varm:", report["restored_varm"])
         if report["recategorized_obs"] or report["recategorized_var"]:
-
+            logger.info(
                 "Recategorized columns (obs/var):",
                 report["recategorized_obs"],
                 report["recategorized_var"],
             )
         if report["missing_backups"]:
-
+            logger.info(
                 "Missing backups or object columns without backups (investigate):",
                 report["missing_backups"],
            )
         if report["errors"]:
-
+            logger.error("Errors encountered (see report['errors']):")
             for e in report["errors"]:
-
-
+                logger.error(" -", e)
+        logger.info("=== end summary ===\n")
 
     return adata, report
 
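Both functions now route their reports through a module-level logger obtained from smftools.logging_utils.get_logger. A minimal sketch of that pattern, assuming get_logger is a thin wrapper over the standard logging module (the wrapper body below is an assumption, not the package's code):

    import logging

    def get_logger(name: str) -> logging.Logger:
        # Hypothetical stand-in for smftools.logging_utils.get_logger:
        # configure the root logger once, then hand out named loggers.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.DEBUG,
                format="%(levelname)s %(name)s: %(message)s",
            )
        return logging.getLogger(name)

    logger = get_logger(__name__)

    err = ValueError("example")
    logger.error(" - %s", err)  # extra positional args are %-formatted lazily
    logger.error(f" - {err}")   # f-string equivalent, formatted eagerly

One caveat when reading the summary hunks above: logging interpolates extra positional arguments into the message with %-formatting, so print-style calls such as logger.error(" -", e) produce a "--- Logging error ---" traceback rather than concatenated output; the %-style or f-string forms sketched here are the drop-in equivalents.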
smftools/tools/__init__.py
CHANGED

@@ -6,6 +6,8 @@ _LAZY_ATTRS = {
     "calculate_leiden": "smftools.tools.calculate_leiden",
     "calculate_nmf": "smftools.tools.calculate_nmf",
     "calculate_sequence_cp_decomposition": "smftools.tools.tensor_factorization",
+    "calculate_pca": "smftools.tools.calculate_pca",
+    "calculate_knn": "smftools.tools.calculate_knn",
     "calculate_umap": "smftools.tools.calculate_umap",
     "cluster_adata_on_methylation": "smftools.tools.cluster_adata_on_methylation",
     "combine_layers": "smftools.tools.general_tools",

@@ -14,7 +16,11 @@ _LAZY_ATTRS = {
     "calculate_relative_risk_on_activity": "smftools.tools.position_stats",
     "compute_positionwise_statistics": "smftools.tools.position_stats",
     "calculate_row_entropy": "smftools.tools.read_stats",
+    "align_sequences_with_mismatches": "smftools.tools.sequence_alignment",
     "rolling_window_nn_distance": "smftools.tools.rolling_nn_distance",
+    "annotate_zero_hamming_segments": "smftools.tools.rolling_nn_distance",
+    "assign_per_read_segments_layer": "smftools.tools.rolling_nn_distance",
+    "select_top_segments_per_read": "smftools.tools.rolling_nn_distance",
     "subset_adata": "smftools.tools.subset_adata",
 }
 
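The _LAZY_ATTRS additions register the new PCA, KNN, alignment, and segment-selection helpers for lazy import: each entry maps a public attribute name to the submodule that defines it, so heavy dependencies load only on first access. The resolver itself is outside this diff; a minimal sketch of how such a table is typically consumed via a module-level __getattr__ (PEP 562), where the function body is an assumption rather than smftools' verbatim code:

    # Sketch of a package __init__.py consuming a _LAZY_ATTRS table (PEP 562).
    import importlib

    _LAZY_ATTRS = {
        "calculate_pca": "smftools.tools.calculate_pca",
        "calculate_knn": "smftools.tools.calculate_knn",
    }

    def __getattr__(name: str):
        # Invoked only when `name` isn't found normally: import the owning
        # module on first access and cache the attribute in module globals.
        try:
            module_path = _LAZY_ATTRS[name]
        except KeyError:
            raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
        attr = getattr(importlib.import_module(module_path), name)
        globals()[name] = attr  # later lookups bypass __getattr__
        return attr

With this shape, `import smftools.tools` stays cheap, while `from smftools.tools import calculate_pca` triggers the submodule import at that point.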