smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/flag_duplicate_reads.py CHANGED

@@ -1,51 +1,64 @@
+from __future__ import annotations
+
 # duplicate_detection_with_hier_and_plots.py
 import copy
 import math
 import os
 import warnings
 from collections import defaultdict
-from
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
 
-import anndata as ad
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import
+from scipy.cluster import hierarchy as sch
+from scipy.spatial.distance import pdist, squareform
+from scipy.stats import gaussian_kde
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 from ..readwrite import make_dirs
 
 logger = get_logger(__name__)
 
-
-
-    from scipy.cluster import hierarchy as sch
-    from scipy.spatial.distance import pdist, squareform
-
-    SCIPY_AVAILABLE = True
-except Exception:
-    sch = None
-    pdist = None
-    squareform = None
-    SCIPY_AVAILABLE = False
+plt = require("matplotlib.pyplot", extra="plotting", purpose="duplicate read plots")
+torch = require("torch", extra="torch", purpose="duplicate read detection")
 
-
-
-    from sklearn.decomposition import PCA
-    from sklearn.metrics import silhouette_score
-    from sklearn.mixture import GaussianMixture
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-
-    PCA = None
-    KMeans = DBSCAN = GaussianMixture = silhouette_score = None
-    SKLEARN_AVAILABLE = False
+SCIPY_AVAILABLE = True
+SKLEARN_AVAILABLE = find_spec("sklearn") is not None
 
-
-
-
-
+PCA = None
+KMeans = DBSCAN = GaussianMixture = silhouette_score = None
+if SKLEARN_AVAILABLE:
+    sklearn_cluster = require(
+        "sklearn.cluster",
+        extra="ml-base",
+        purpose="duplicate read clustering",
+    )
+    sklearn_decomp = require(
+        "sklearn.decomposition",
+        extra="ml-base",
+        purpose="duplicate read PCA",
+    )
+    sklearn_metrics = require(
+        "sklearn.metrics",
+        extra="ml-base",
+        purpose="duplicate read clustering diagnostics",
+    )
+    sklearn_mixture = require(
+        "sklearn.mixture",
+        extra="ml-base",
+        purpose="duplicate read clustering",
+    )
+    DBSCAN = sklearn_cluster.DBSCAN
+    KMeans = sklearn_cluster.KMeans
+    PCA = sklearn_decomp.PCA
+    silhouette_score = sklearn_metrics.silhouette_score
+    GaussianMixture = sklearn_mixture.GaussianMixture
 
 
 def merge_uns_preserve(orig_uns: dict, new_uns: dict, prefer: str = "orig") -> dict:

@@ -153,24 +166,6 @@ def flag_duplicate_reads(
     import numpy as np
     import pandas as pd
 
-    # optional imports already guarded at module import time, but re-check
-    try:
-        from scipy.cluster import hierarchy as sch
-        from scipy.spatial.distance import pdist
-
-        SCIPY_AVAILABLE = True
-    except Exception:
-        sch = None
-        pdist = None
-        SCIPY_AVAILABLE = False
-    try:
-        from sklearn.decomposition import PCA
-
-        SKLEARN_AVAILABLE = True
-    except Exception:
-        PCA = None
-        SKLEARN_AVAILABLE = False
-
     # -------- helper: demux-aware keeper selection --------
     def _choose_keeper_with_demux_preference(
         members_idx: List[int],

@@ -1577,13 +1572,6 @@ def _run_clustering(
     Run clustering on 2D points (x,y). Returns labels (len = npoints) and diagnostics dict.
     Labels follow sklearn conventions (noise -> -1 for DBSCAN/HDBSCAN).
     """
-    try:
-        from sklearn.cluster import DBSCAN, KMeans
-        from sklearn.metrics import silhouette_score
-        from sklearn.mixture import GaussianMixture
-    except Exception:
-        KMeans = DBSCAN = GaussianMixture = silhouette_score = None
-
     pts = np.column_stack([x, y])
     diagnostics: Dict[str, Any] = {"method": method, "n_input": len(x)}
     if len(x) < min_points:
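
Throughout this release, ad-hoc try/except import guards like the ones removed above are centralized in smftools.optional_imports.require (a new module, +31 lines in the file list). Only the call sites are visible in this diff, so the following is a minimal sketch of what such a helper plausibly looks like, not the actual implementation:

    # Hypothetical sketch of an optional-import helper consistent with the
    # call sites above; the real smftools.optional_imports.require may differ.
    from importlib import import_module

    def require(module: str, *, extra: str, purpose: str):
        """Import `module`, or raise an error naming the pip extra to install."""
        try:
            return import_module(module)
        except ImportError as err:
            raise ImportError(
                f"{purpose} requires '{module}'. "
                f"Install it with: pip install 'smftools[{extra}]'"
            ) from err

The win over the old pattern is an actionable failure at the point a feature actually needs its dependency, instead of None placeholders that surface later as opaque AttributeErrors.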
smftools/readwrite.py CHANGED

@@ -431,6 +431,8 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         "layers_skipped": [],
         "obsm_converted": [],
         "obsm_skipped": [],
+        "varm_converted": [],
+        "varm_skipped": [],
         "X_replaced_or_converted": None,
         "errors": [],
     }

@@ -605,10 +607,16 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
     def _sanitize_layers_obsm(src_dict, which: str):
         """
-        Ensure arrays in layers/obsm are numeric and non-object dtype.
+        Ensure arrays in layers/obsm/varm are numeric and non-object dtype.
         Returns a cleaned dict suitable to pass into AnnData(...)
         If an entry is not convertible, it is backed up & skipped.
         """
+        report_map = {
+            "layers": ("layers_converted", "layers_skipped"),
+            "obsm": ("obsm_converted", "obsm_skipped"),
+            "varm": ("varm_converted", "varm_skipped"),
+        }
+        converted_key, skipped_key = report_map[which]
         cleaned = {}
         for k, v in src_dict.items():
             try:

@@ -618,9 +626,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     arr_f = arr.astype(float)
                     cleaned[k] = arr_f
                     report_key = f"{which}.{k}"
-                    report["layers_converted"].append(
-                        report_key
-                    ) if which == "layers" else report["obsm_converted"].append(report_key)
+                    report[converted_key].append(report_key)
                     if verbose:
                         print(f" {which}.{k} object array coerced to float.")
             except Exception:

@@ -628,18 +634,13 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         arr_i = arr.astype(int)
                         cleaned[k] = arr_i
                         report_key = f"{which}.{k}"
-                        report["layers_converted"].append(
-                            report_key
-                        ) if which == "layers" else report["obsm_converted"].append(report_key)
+                        report[converted_key].append(report_key)
                         if verbose:
                             print(f" {which}.{k} object array coerced to int.")
                 except Exception:
                     if backup:
                         _backup(v, f"{which}_{k}_backup")
-                    if which == "layers":
-                        report["layers_skipped"].append(k)
-                    else:
-                        report["obsm_skipped"].append(k)
+                    report[skipped_key].append(k)
                     if verbose:
                         print(
                             f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"

@@ -650,10 +651,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             except Exception as e:
                 if backup:
                     _backup(v, f"{which}_{k}_backup")
-                if which == "layers":
-                    report["layers_skipped"].append(k)
-                else:
-                    report["obsm_skipped"].append(k)
+                report[skipped_key].append(k)
                 msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
                 report["errors"].append(msg)
                 if verbose:

@@ -693,6 +691,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
     # ---------- sanitize layers and obsm ----------
     layers_src = getattr(adata, "layers", {})
     obsm_src = getattr(adata, "obsm", {})
+    varm_src = getattr(adata, "varm", {})
 
     try:
         layers_clean = _sanitize_layers_obsm(layers_src, "layers")

@@ -712,6 +711,15 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             print(msg)
         obsm_clean = {}
 
+    try:
+        varm_clean = _sanitize_layers_obsm(varm_src, "varm")
+    except Exception as e:
+        msg = f"Failed to sanitize varm: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        varm_clean = {}
+
     # ---------- handle X ----------
     X_to_use = adata.X
     try:

@@ -747,7 +755,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
         layers=layers_clean,
         uns=uns_clean,
         obsm=obsm_clean,
-        varm=
+        varm=varm_clean,
     )
 
     # preserve names (as strings)

@@ -872,6 +880,16 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             }
         )
 
+    # varm
+    for k, v in adata_copy.varm.items():
+        meta_rows.append(
+            {
+                "kind": "varm",
+                "name": k,
+                "dtype": str(np.asarray(v).dtype),
+            }
+        )
+
     # uns
     for k, v in adata_copy.uns.items():
         meta_rows.append(

@@ -977,6 +995,7 @@ def safe_read_h5ad(
         "parsed_uns_json_keys": [],
         "restored_layers": [],
         "restored_obsm": [],
+        "restored_varm": [],
         "recategorized_obs": [],
         "recategorized_var": [],
         "missing_backups": [],

@@ -1215,7 +1234,7 @@ def safe_read_h5ad(
             print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
 
     # 5) Restore layers and obsm from backups if present
-    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl, varm_<name>_backup.pkl
     if os.path.isdir(backup_dir):
         for fname in os.listdir(backup_dir):
             if fname.startswith("layers_") and fname.endswith("_backup.pkl"):

@@ -1248,6 +1267,21 @@ def safe_read_h5ad(
                             f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
                         )
 
+            if fname.startswith("varm_") and fname.endswith("_backup.pkl"):
+                varm_name = fname[len("varm_") : -len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.varm[varm_name] = np.asarray(val)
+                        report["restored_varm"].append((varm_name, full))
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored varm['{varm_name}'] from {full}")
+                    except Exception as e:
+                        report["errors"].append(
+                            f"Failed to restore varm['{varm_name}'] from {full}: {e}"
+                        )
+
     # 6) If restore_backups True but some expected backups missing, note them
     if restore_backups and os.path.isdir(backup_dir):
         # detect common expected names from obs/var/uns/layers in adata

@@ -1297,6 +1331,8 @@ def safe_read_h5ad(
         print("Restored layers:", report["restored_layers"])
     if report["restored_obsm"]:
         print("Restored obsm:", report["restored_obsm"])
+    if report["restored_varm"]:
+        print("Restored varm:", report["restored_varm"])
     if report["recategorized_obs"] or report["recategorized_var"]:
         print(
             "Recategorized columns (obs/var):",
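
The report_map lookup is what lets varm join layers and obsm without another copy of the `which == "layers"` ternary. A self-contained sketch of the dispatch pattern (simplified: the real _sanitize_layers_obsm also tries integer coercion and pickles a backup before skipping):

    import numpy as np

    report = {f"{kind}_{status}": [] for kind in ("layers", "obsm", "varm")
              for status in ("converted", "skipped")}
    report_map = {kind: (f"{kind}_converted", f"{kind}_skipped")
                  for kind in ("layers", "obsm", "varm")}

    def sanitize(src_dict, which):
        # One table lookup replaces per-kind branching; supporting a new
        # mapping kind only requires a new report_map entry.
        converted_key, skipped_key = report_map[which]
        cleaned = {}
        for k, v in src_dict.items():
            try:
                cleaned[k] = np.asarray(v).astype(float)
                report[converted_key].append(f"{which}.{k}")
            except (TypeError, ValueError):
                report[skipped_key].append(k)
        return cleaned

    sanitize({"ok": [[1.0, 2.0]], "bad": [[object()]]}, "varm")
    print(report["varm_converted"], report["varm_skipped"])  # ['varm.ok'] ['bad']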
smftools/schema/anndata_schema_v1.yaml CHANGED

@@ -60,6 +60,20 @@ stages:
       notes: "Mapping quality score."
       requires: []
       optional_inputs: []
+    reference_start:
+      dtype: "float"
+      created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"
+      modified_by: []
+      notes: "0-based reference start position for the alignment."
+      requires: []
+      optional_inputs: []
+    reference_end:
+      dtype: "float"
+      created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"
+      modified_by: []
+      notes: "0-based reference end position (exclusive) for the alignment."
+      requires: []
+      optional_inputs: []
     read_length_to_reference_length_ratio:
       dtype: "float"
       created_by: "smftools.informatics.h5ad_functions.add_read_length_and_mapping_qc"

@@ -179,7 +193,7 @@ stages:
   obs:
     leiden:
       dtype: "category"
-      created_by: "smftools.tools.
+      created_by: "smftools.tools.calculate_leiden"
       modified_by: []
       notes: "Leiden cluster assignments."
       requires: [["obsm.X_umap"]]
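
Per the notes, reference_start and reference_end use 0-based, end-exclusive coordinates (the convention pysam and BED use), so the reference span an alignment covers is a plain difference; the float dtype presumably leaves room for NaN on unaligned reads. A quick illustration:

    import numpy as np

    # 0-based, end-exclusive coordinates: the covered span is simply end - start.
    reference_start = np.array([100.0, 2500.0])
    reference_end = np.array([350.0, 4100.0])
    aligned_span = reference_end - reference_start  # array([ 250., 1600.])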
smftools/tools/__init__.py CHANGED

@@ -1,19 +1,31 @@
-from
-from .cluster_adata_on_methylation import cluster_adata_on_methylation
-from .general_tools import combine_layers, create_nan_mask_from_X, create_nan_or_non_gpc_mask
-from .position_stats import calculate_relative_risk_on_activity, compute_positionwise_statistics
-from .read_stats import calculate_row_entropy
-from .spatial_autocorrelation import *
-from .subset_adata import subset_adata
+from __future__ import annotations
 
-
-
-
-"
-"
-"
-"
-"
-"combine_layers",
-"
-
+from importlib import import_module
+
+_LAZY_ATTRS = {
+    "calculate_leiden": "smftools.tools.calculate_leiden",
+    "calculate_nmf": "smftools.tools.calculate_nmf",
+    "calculate_sequence_cp_decomposition": "smftools.tools.tensor_factorization",
+    "calculate_umap": "smftools.tools.calculate_umap",
+    "cluster_adata_on_methylation": "smftools.tools.cluster_adata_on_methylation",
+    "combine_layers": "smftools.tools.general_tools",
+    "create_nan_mask_from_X": "smftools.tools.general_tools",
+    "create_nan_or_non_gpc_mask": "smftools.tools.general_tools",
+    "calculate_relative_risk_on_activity": "smftools.tools.position_stats",
+    "compute_positionwise_statistics": "smftools.tools.position_stats",
+    "calculate_row_entropy": "smftools.tools.read_stats",
+    "rolling_window_nn_distance": "smftools.tools.rolling_nn_distance",
+    "subset_adata": "smftools.tools.subset_adata",
+}
+
+
+def __getattr__(name: str):
+    if name in _LAZY_ATTRS:
+        module = import_module(_LAZY_ATTRS[name])
+        attr = getattr(module, name)
+        globals()[name] = attr
+        return attr
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
+__all__ = list(_LAZY_ATTRS.keys())
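
This is the PEP 562 module-level __getattr__ pattern: the eager submodule imports are deferred until first attribute access and then cached in globals(), so `import smftools.tools` no longer pulls in every tool's heavy dependencies. Call sites are unchanged:

    from smftools import tools

    # First access imports smftools.tools.calculate_umap and caches the
    # function in the package namespace; later lookups bypass __getattr__.
    fn = tools.calculate_umap
    assert fn is tools.calculate_umap

    # Unknown names still fail as usual:
    # tools.not_a_tool  ->  AttributeError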
smftools/tools/calculate_leiden.py ADDED

@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def calculate_leiden(
+    adata: "ad.AnnData",
+    *,
+    resolution: float = 0.1,
+    key_added: str = "leiden",
+    connectivities_key: str = "connectivities",
+) -> "ad.AnnData":
+    """Compute Leiden clusters from a connectivity graph.
+
+    Args:
+        adata: AnnData object with ``obsp[connectivities_key]`` set.
+        resolution: Resolution parameter for Leiden clustering.
+        key_added: Column name to store cluster assignments in ``adata.obs``.
+        connectivities_key: Key in ``adata.obsp`` containing a sparse adjacency matrix.
+
+    Returns:
+        Updated AnnData object with Leiden labels in ``adata.obs``.
+    """
+    if connectivities_key not in adata.obsp:
+        raise KeyError(f"Missing connectivities '{connectivities_key}' in adata.obsp.")
+
+    igraph = require("igraph", extra="cluster", purpose="Leiden clustering")
+    leidenalg = require("leidenalg", extra="cluster", purpose="Leiden clustering")
+
+    connectivities = adata.obsp[connectivities_key]
+    coo = connectivities.tocoo()
+    edges = list(zip(coo.row.tolist(), coo.col.tolist()))
+    graph = igraph.Graph(n=connectivities.shape[0], edges=edges, directed=False)
+    graph.es["weight"] = coo.data.tolist()
+
+    partition = leidenalg.find_partition(
+        graph,
+        leidenalg.RBConfigurationVertexPartition,
+        weights=graph.es["weight"],
+        resolution_parameter=resolution,
+    )
+
+    labels = np.array(partition.membership, dtype=str)
+    adata.obs[key_added] = pd.Categorical(labels)
+    logger.info("Stored Leiden clusters in adata.obs['%s'].", key_added)
+    return adata
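
A usage sketch for the new function, assuming the optional cluster extra (python-igraph and leidenalg) is installed; the connectivity matrix here is a toy two-component graph rather than a real k-NN graph:

    import anndata as ad
    import numpy as np
    from scipy.sparse import csr_matrix

    from smftools.tools import calculate_leiden

    adata = ad.AnnData(X=np.random.rand(6, 4))
    # Toy symmetric connectivities: two disconnected triangles {0,1,2} and {3,4,5}.
    rows = [0, 1, 1, 2, 2, 0, 3, 4, 4, 5, 5, 3]
    cols = [1, 0, 2, 1, 0, 2, 4, 3, 5, 4, 3, 5]
    adata.obsp["connectivities"] = csr_matrix(
        (np.ones(len(rows)), (rows, cols)), shape=(6, 6)
    )

    calculate_leiden(adata, resolution=0.1, key_added="leiden")
    print(adata.obs["leiden"].value_counts())  # expect two clusters of three reads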
smftools/tools/calculate_nmf.py ADDED

@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Sequence
+
+import numpy as np
+
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def calculate_nmf(
+    adata: "ad.AnnData",
+    layer: str | None = "nan_half",
+    var_filters: Sequence[str] | None = None,
+    n_components: int = 2,
+    max_iter: int = 200,
+    random_state: int = 0,
+    overwrite: bool = True,
+    embedding_key: str = "X_nmf",
+    components_key: str = "H_nmf",
+    uns_key: str = "nmf",
+) -> "ad.AnnData":
+    """Compute a low-dimensional NMF embedding.
+
+    Args:
+        adata: AnnData object to update.
+        layer: Layer name to use for NMF (``None`` uses ``adata.X``).
+        var_filters: Optional list of var masks to subset features.
+        n_components: Number of NMF components to compute.
+        max_iter: Maximum number of NMF iterations.
+        random_state: Random seed for the NMF initializer.
+        overwrite: Whether to recompute if the embedding already exists.
+        embedding_key: Key for the embedding in ``adata.obsm``.
+        components_key: Key for the components matrix in ``adata.varm``.
+        uns_key: Key for metadata stored in ``adata.uns``.
+
+    Returns:
+        anndata.AnnData: Updated AnnData object.
+    """
+    from scipy.sparse import issparse
+
+    require("sklearn", extra="ml-base", purpose="NMF calculation")
+    from sklearn.decomposition import NMF
+
+    has_embedding = embedding_key in adata.obsm
+    has_components = components_key in adata.varm
+    if has_embedding and has_components and not overwrite:
+        logger.info("NMF embedding and components already present; skipping recomputation.")
+        return adata
+    if has_embedding and not has_components and not overwrite:
+        logger.info("NMF embedding present without components; recomputing to store components.")
+
+    subset_mask = None
+    if var_filters:
+        subset_mask = np.logical_or.reduce([adata.var[f].values for f in var_filters])
+        adata_subset = adata[:, subset_mask].copy()
+        logger.info(
+            "Subsetting adata: retained %s features based on filters %s",
+            adata_subset.shape[1],
+            var_filters,
+        )
+    else:
+        adata_subset = adata.copy()
+        logger.info("No var filters provided. Using all features.")
+
+    data = adata_subset.layers[layer] if layer else adata_subset.X
+    if issparse(data):
+        data = data.copy()
+        if data.data.size and np.isnan(data.data).any():
+            logger.warning("NaNs detected in sparse data, filling with 0.5 before NMF.")
+            data.data = np.nan_to_num(data.data, nan=0.5)
+        if data.data.size and (data.data < 0).any():
+            logger.warning("Negative values detected in sparse data, clipping to 0 for NMF.")
+            data.data[data.data < 0] = 0
+    else:
+        if np.isnan(data).any():
+            logger.warning("NaNs detected, filling with 0.5 before NMF.")
+            data = np.nan_to_num(data, nan=0.5)
+        if (data < 0).any():
+            logger.warning("Negative values detected, clipping to 0 for NMF.")
+            data = np.clip(data, a_min=0, a_max=None)
+
+    model = NMF(
+        n_components=n_components,
+        init="nndsvda",
+        max_iter=max_iter,
+        random_state=random_state,
+    )
+    embedding = model.fit_transform(data)
+    components = model.components_.T
+
+    if subset_mask is not None:
+        components_matrix = np.zeros((adata.shape[1], components.shape[1]))
+        components_matrix[subset_mask, :] = components
+    else:
+        components_matrix = components
+
+    adata.obsm[embedding_key] = embedding
+    adata.varm[components_key] = components_matrix
+    adata.uns[uns_key] = {
+        "n_components": n_components,
+        "max_iter": max_iter,
+        "random_state": random_state,
+        "layer": layer,
+        "var_filters": list(var_filters) if var_filters else None,
+        "components_key": components_key,
+    }
+
+    logger.info(
+        "Stored: adata.obsm['%s'] and adata.varm['%s']",
+        embedding_key,
+        components_key,
+    )
+    return adata
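
And a matching sketch for calculate_nmf, assuming scikit-learn is available via the ml-base extra. As the warnings in the function describe, NaNs are imputed with 0.5 and negative values are clipped to zero before fitting:

    import anndata as ad
    import numpy as np

    from smftools.tools import calculate_nmf

    rng = np.random.default_rng(0)
    adata = ad.AnnData(X=rng.random((50, 30)))
    layer = adata.X.copy()
    layer[rng.random(layer.shape) < 0.1] = np.nan  # sprinkle in missing calls
    adata.layers["nan_half"] = layer

    calculate_nmf(adata, layer="nan_half", n_components=2)
    print(adata.obsm["X_nmf"].shape)         # (50, 2)
    print(adata.varm["H_nmf"].shape)         # (30, 2)
    print(adata.uns["nmf"]["n_components"])  # 2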