smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
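A change that recurs throughout the module diffs below is that heavy optional dependencies (matplotlib, scikit-learn, torch) are no longer imported at module top level; each module now resolves them lazily through `smftools.optional_imports.require(...)`, a helper introduced in 0.3.0 (smftools/optional_imports.py, +31 lines). That file's contents are not shown in this diff, so the following is only a minimal sketch of how such a helper could behave, assuming it wraps `importlib.import_module` and points the user at the matching pip extra on failure; the real implementation may differ.

# Hypothetical sketch of a lazy-import helper in the spirit of
# smftools.optional_imports.require; the actual implementation is not shown here.
import importlib


def require(module_name: str, extra: str, purpose: str = ""):
    """Import module_name, or raise an ImportError naming the pip extra to install."""
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        hint = f" (needed for {purpose})" if purpose else ""
        raise ImportError(
            f"'{module_name}' is required{hint}. "
            f"Install it with: pip install 'smftools[{extra}]'"
        ) from exc


# Usage mirroring the call sites in the diffs below:
# plt = require("matplotlib.pyplot", extra="plotting", purpose="evaluation plots")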
smftools/machine_learning/evaluation/evaluators.py
@@ -1,15 +1,26 @@
+ from __future__ import annotations
+
  import numpy as np
  import pandas as pd
- import matplotlib.pyplot as plt
 
- from sklearn.metrics import (
-     roc_auc_score, precision_recall_curve, auc, f1_score, confusion_matrix, roc_curve
- )
+ from smftools.optional_imports import require
+
+ plt = require("matplotlib.pyplot", extra="plotting", purpose="evaluation plots")
+ sklearn_metrics = require("sklearn.metrics", extra="ml-base", purpose="model evaluation")
+
+ auc = sklearn_metrics.auc
+ confusion_matrix = sklearn_metrics.confusion_matrix
+ f1_score = sklearn_metrics.f1_score
+ precision_recall_curve = sklearn_metrics.precision_recall_curve
+ roc_auc_score = sklearn_metrics.roc_auc_score
+ roc_curve = sklearn_metrics.roc_curve
+
 
  class ModelEvaluator:
      """
      A model evaluator for consolidating Sklearn and Lightning model evaluation metrics on testing data
      """
+
      def __init__(self):
          self.results = []
          self.pos_freq = None
@@ -21,41 +32,45 @@ class ModelEvaluator:
          """
          if is_torch:
              entry = {
-                 'name': name,
-                 'f1': model.test_f1,
-                 'auc': model.test_roc_auc,
-                 'pr_auc': model.test_pr_auc,
-                 'pr_auc_norm': model.test_pr_auc / model.test_pos_freq if model.test_pos_freq > 0 else np.nan,
-                 'pr_curve': model.test_pr_curve,
-                 'roc_curve': model.test_roc_curve,
-                 'num_pos': model.test_num_pos,
-                 'pos_freq': model.test_pos_freq
+                 "name": name,
+                 "f1": model.test_f1,
+                 "auc": model.test_roc_auc,
+                 "pr_auc": model.test_pr_auc,
+                 "pr_auc_norm": model.test_pr_auc / model.test_pos_freq
+                 if model.test_pos_freq > 0
+                 else np.nan,
+                 "pr_curve": model.test_pr_curve,
+                 "roc_curve": model.test_roc_curve,
+                 "num_pos": model.test_num_pos,
+                 "pos_freq": model.test_pos_freq,
              }
          else:
              entry = {
-                 'name': name,
-                 'f1': model.test_f1,
-                 'auc': model.test_roc_auc,
-                 'pr_auc': model.test_pr_auc,
-                 'pr_auc_norm': model.test_pr_auc / model.test_pos_freq if model.test_pos_freq > 0 else np.nan,
-                 'pr_curve': model.test_pr_curve,
-                 'roc_curve': model.test_roc_curve,
-                 'num_pos': model.test_num_pos,
-                 'pos_freq': model.test_pos_freq
+                 "name": name,
+                 "f1": model.test_f1,
+                 "auc": model.test_roc_auc,
+                 "pr_auc": model.test_pr_auc,
+                 "pr_auc_norm": model.test_pr_auc / model.test_pos_freq
+                 if model.test_pos_freq > 0
+                 else np.nan,
+                 "pr_curve": model.test_pr_curve,
+                 "roc_curve": model.test_roc_curve,
+                 "num_pos": model.test_num_pos,
+                 "pos_freq": model.test_pos_freq,
              }
-
+
          self.results.append(entry)
 
          if not self.pos_freq:
-             self.pos_freq = entry['pos_freq']
-             self.num_pos = entry['num_pos']
+             self.pos_freq = entry["pos_freq"]
+             self.num_pos = entry["num_pos"]
 
      def get_metrics_dataframe(self):
          """
          Return all metrics as pandas DataFrame.
          """
          df = pd.DataFrame(self.results)
-         return df[['name', 'f1', 'auc', 'pr_auc', 'pr_auc_norm', 'num_pos', 'pos_freq']]
+         return df[["name", "f1", "auc", "pr_auc", "pr_auc_norm", "num_pos", "pos_freq"]]
 
      def plot_all_curves(self):
          """
@@ -66,30 +81,31 @@ class ModelEvaluator:
          # ROC
          plt.subplot(1, 2, 1)
          for res in self.results:
-             fpr, tpr = res['roc_curve']
+             fpr, tpr = res["roc_curve"]
              plt.plot(fpr, tpr, label=f"{res['name']} (AUC={res['auc']:.3f})")
          plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
          plt.xlabel("False Positive Rate")
          plt.ylabel("True Positive Rate")
-         plt.ylim(0,1.05)
+         plt.ylim(0, 1.05)
          plt.title(f"ROC Curves - {self.num_pos} positive instances")
          plt.legend()
 
          # PR
          plt.subplot(1, 2, 2)
          for res in self.results:
-             rc, pr = res['pr_curve']
+             rc, pr = res["pr_curve"]
              plt.plot(rc, pr, label=f"{res['name']} (AUPRC={res['pr_auc']:.3f})")
          plt.xlabel("Recall")
          plt.ylabel("Precision")
-         plt.ylim(0,1.05)
-         plt.axhline(self.pos_freq, linestyle='--', color='grey')
+         plt.ylim(0, 1.05)
+         plt.axhline(self.pos_freq, linestyle="--", color="grey")
          plt.title(f"Precision-Recall Curves - {self.num_pos} positive instances")
          plt.legend()
 
          plt.tight_layout()
          plt.show()
 
+
  class PostInferenceModelEvaluator:
      def __init__(self, adata, models, target_eval_freq=None, max_eval_positive=None):
          """
@@ -179,12 +195,14 @@ class PostInferenceModelEvaluator:
              "pos_freq": pos_freq,
              "confusion_matrix": cm,
              "pr_rc_curve": (pr, rc),
-             "roc_curve": (tpr, fpr)
+             "roc_curve": (tpr, fpr),
          }
 
          return metrics
-
-     def _subsample_for_fixed_positive_frequency(self, binary_labels, target_freq=0.3, max_positive=None):
+
+     def _subsample_for_fixed_positive_frequency(
+         self, binary_labels, target_freq=0.3, max_positive=None
+     ):
          pos_idx = np.where(binary_labels == 1)[0]
          neg_idx = np.where(binary_labels == 0)[0]
 

smftools/machine_learning/inference/__init__.py
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  from .lightning_inference import run_lightning_inference
+ from .sklearn_inference import run_sklearn_inference
  from .sliding_window_inference import sliding_window_inference
- from .sklearn_inference import run_sklearn_inference

smftools/machine_learning/inference/inference_utils.py
@@ -1,5 +1,8 @@
+ from __future__ import annotations
+
  import pandas as pd
 
+
  def annotate_split_column(adata, model, split_col="split"):
      """
      Annotate adata.obs with train/val/test/new labels based on model's stored obs_names.
@@ -8,7 +11,7 @@ def annotate_split_column(adata, model, split_col="split"):
      train_set = set(model.train_obs_names)
      val_set = set(model.val_obs_names)
      test_set = set(model.test_obs_names)
-
+
      # Create array for split labels
      split_labels = []
      for obs in adata.obs_names:
@@ -20,8 +23,10 @@ def annotate_split_column(adata, model, split_col="split"):
              split_labels.append("testing")
          else:
              split_labels.append("new")
-
+
      # Store in AnnData.obs
-     adata.obs[split_col] = pd.Categorical(split_labels, categories=["training", "validation", "testing", "new"])
-
+     adata.obs[split_col] = pd.Categorical(
+         split_labels, categories=["training", "validation", "testing", "new"]
+     )
+
      print(f"Annotated {split_col} column with training/validation/testing/new status.")

smftools/machine_learning/inference/lightning_inference.py
@@ -1,17 +1,16 @@
- import torch
- import pandas as pd
+ from __future__ import annotations
+
  import numpy as np
- from pytorch_lightning import Trainer
+ import pandas as pd
+
+ from smftools.optional_imports import require
+
  from .inference_utils import annotate_split_column
 
- def run_lightning_inference(
-     adata,
-     model,
-     datamodule,
-     trainer,
-     prefix="model",
-     devices=1
- ):
+ torch = require("torch", extra="ml-base", purpose="Lightning inference")
+
+
+ def run_lightning_inference(adata, model, datamodule, trainer, prefix="model", devices=1):
      """
      Run inference on AnnData using TorchClassifierWrapper + AnnDataModule (in inference mode).
      """
@@ -57,7 +56,9 @@ def run_lightning_inference(
      full_prefix = f"{prefix}_{label_col}"
 
      adata.obs[f"{full_prefix}_pred"] = pred_class_idx
-     adata.obs[f"{full_prefix}_pred_label"] = pd.Categorical(pred_class_labels, categories=class_labels)
+     adata.obs[f"{full_prefix}_pred_label"] = pd.Categorical(
+         pred_class_labels, categories=class_labels
+     )
      adata.obs[f"{full_prefix}_pred_prob"] = pred_class_probs
 
      for i, class_name in enumerate(class_labels):
@@ -65,4 +66,4 @@ def run_lightning_inference(
 
      adata.obsm[f"{full_prefix}_pred_prob_all"] = probs_all
 
-     print(f"Inference complete: stored under prefix '{full_prefix}'")
+     print(f"Inference complete: stored under prefix '{full_prefix}'")

smftools/machine_learning/inference/sklearn_inference.py
@@ -1,14 +1,12 @@
- import pandas as pd
+ from __future__ import annotations
+
  import numpy as np
+ import pandas as pd
+
  from .inference_utils import annotate_split_column
 
 
- def run_sklearn_inference(
-     adata,
-     model,
-     datamodule,
-     prefix="model"
- ):
+ def run_sklearn_inference(adata, model, datamodule, prefix="model"):
      """
      Run inference on AnnData using SklearnModelWrapper.
      """
@@ -44,7 +42,9 @@ def run_sklearn_inference(
      full_prefix = f"{prefix}_{label_col}"
 
      adata.obs[f"{full_prefix}_pred"] = pred_class_idx
-     adata.obs[f"{full_prefix}_pred_label"] = pd.Categorical(pred_class_labels, categories=class_labels)
+     adata.obs[f"{full_prefix}_pred_label"] = pd.Categorical(
+         pred_class_labels, categories=class_labels
+     )
      adata.obs[f"{full_prefix}_pred_prob"] = pred_class_probs
 
      for i, class_name in enumerate(class_labels):

smftools/machine_learning/inference/sliding_window_inference.py
@@ -1,18 +1,21 @@
+ from __future__ import annotations
+
  from ..data import AnnDataModule
  from ..evaluation import PostInferenceModelEvaluator
  from .lightning_inference import run_lightning_inference
  from .sklearn_inference import run_sklearn_inference
 
+
  def sliding_window_inference(
-     adata,
-     trained_results,
-     tensor_source='X',
+     adata,
+     trained_results,
+     tensor_source="X",
      tensor_key=None,
-     label_col='activity_status',
+     label_col="activity_status",
      batch_size=64,
      cleanup=False,
-     target_eval_freq=None,
-     max_eval_positive=None
+     target_eval_freq=None,
+     max_eval_positive=None,
  ):
      """
      Apply trained sliding window models to an AnnData object (Lightning or Sklearn).
@@ -24,11 +27,11 @@ def sliding_window_inference(
          for window_size, window_data in model_dict.items():
              for center_varname, run in window_data.items():
                  print(f"\nEvaluating {model_name} window {window_size} around {center_varname}")
-
+
                  # Extract window start from varname
                  center_idx = adata.var_names.get_loc(center_varname)
                  window_start = center_idx - window_size // 2
-
+
                  # Build datamodule for window
                  datamodule = AnnDataModule(
                      adata,
@@ -38,31 +41,31 @@ def sliding_window_inference(
                      batch_size=batch_size,
                      window_start=window_start,
                      window_size=window_size,
-                     inference_mode=True
+                     inference_mode=True,
                  )
                  datamodule.setup()
 
                  # Extract model + detect type
-                 model = run['model']
+                 model = run["model"]
 
                  # Lightning models
-                 if hasattr(run, 'trainer') or 'trainer' in run:
-                     trainer = run['trainer']
+                 if hasattr(run, "trainer") or "trainer" in run:
+                     trainer = run["trainer"]
                      run_lightning_inference(
                          adata,
                          model=model,
                          datamodule=datamodule,
                          trainer=trainer,
-                         prefix=f"{model_name}_w{window_size}_c{center_varname}"
+                         prefix=f"{model_name}_w{window_size}_c{center_varname}",
                      )
-
+
                  # Sklearn models
                  else:
                      run_sklearn_inference(
                          adata,
                          model=model,
                          datamodule=datamodule,
-                         prefix=f"{model_name}_w{window_size}_c{center_varname}"
+                         prefix=f"{model_name}_w{window_size}_c{center_varname}",
                      )
 
      print("Inference complete across all models.")
@@ -77,27 +80,36 @@ def sliding_window_inference(
                  prefix = f"{model_name}_w{window_size}_c{center_varname}"
                  # Use full key for uniqueness
                  key = prefix
-                 model_wrappers[key] = run['model']
+                 model_wrappers[key] = run["model"]
 
      # Run evaluator
-     evaluator = PostInferenceModelEvaluator(adata, model_wrappers, target_eval_freq=target_eval_freq, max_eval_positive=max_eval_positive)
+     evaluator = PostInferenceModelEvaluator(
+         adata,
+         model_wrappers,
+         target_eval_freq=target_eval_freq,
+         max_eval_positive=max_eval_positive,
+     )
      evaluator.evaluate_all()
 
      # Get results
      df = evaluator.to_dataframe()
 
-     df[['model_name', 'window_size', 'center']] = df['model'].str.extract(r'(\w+)_w(\d+)_c(\d+)_activity_status')
+     df[["model_name", "window_size", "center"]] = df["model"].str.extract(
+         r"(\w+)_w(\d+)_c(\d+)_activity_status"
+     )
 
      # Cast window_size and center to integers for plotting
-     df['window_size'] = df['window_size'].astype(int)
-     df['center'] = df['center'].astype(int)
+     df["window_size"] = df["window_size"].astype(int)
+     df["center"] = df["center"].astype(int)
 
      ## Optional cleanup:
      if cleanup:
-         prefixes = [f"{model_name}_w{window_size}_c{center_varname}"
-                     for model_name, model_dict in trained_results.items()
-                     for window_size, window_data in model_dict.items()
-                     for center_varname in window_data.keys()]
+         prefixes = [
+             f"{model_name}_w{window_size}_c{center_varname}"
+             for model_name, model_dict in trained_results.items()
+             for window_size, window_data in model_dict.items()
+             for center_varname in window_data.keys()
+         ]
 
          # Remove matching obs columns
          for prefix in prefixes:
@@ -111,4 +123,4 @@ def sliding_window_inference(
 
          print(f"Cleaned up {len(prefixes)} model prefixes from AnnData.")
 
-     return df
+     return df
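Note on the regex in the hunk above: it assumes the per-window prefix layout `{model_name}_w{window_size}_c{center_varname}` built earlier in the same function, plus the `_activity_status` suffix that the inference helpers append for the default `label_col`. A standalone check of that extraction, with made-up column values:

# Standalone check of the prefix-parsing regex used in sliding_window_inference;
# the "model" values below are invented for illustration.
import pandas as pd

df = pd.DataFrame({"model": ["cnn_w50_c1200_activity_status", "mlp_w100_c1500_activity_status"]})
df[["model_name", "window_size", "center"]] = df["model"].str.extract(
    r"(\w+)_w(\d+)_c(\d+)_activity_status"
)
print(df)
# str.extract yields strings, hence the .astype(int) casts that follow in the diff.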

smftools/machine_learning/models/__init__.py
@@ -1,9 +1,16 @@
+ from __future__ import annotations
+
  from .base import BaseTorchModel
- from .mlp import MLPClassifier
  from .cnn import CNNClassifier
- from .rnn import RNNClassifier
- from .transformer import BaseTransformer, TransformerClassifier, DANNTransformerClassifier, MaskedTransformerPretrainer
+ from .lightning_base import TorchClassifierWrapper
+ from .mlp import MLPClassifier
  from .positional import PositionalEncoding
+ from .rnn import RNNClassifier
+ from .sklearn_models import SklearnModelWrapper
+ from .transformer import (
+     BaseTransformer,
+     DANNTransformerClassifier,
+     MaskedTransformerPretrainer,
+     TransformerClassifier,
+ )
  from .wrappers import ScaledModel
- from .lightning_base import TorchClassifierWrapper
- from .sklearn_models import SklearnModelWrapper

smftools/machine_learning/models/base.py
@@ -1,17 +1,25 @@
- import torch
- import torch.nn as nn
+ from __future__ import annotations
+
  import numpy as np
+
+ from smftools.optional_imports import require
+
  from ..utils.device import detect_device
 
+ torch = require("torch", extra="ml-base", purpose="ML base models")
+ nn = torch.nn
+
+
  class BaseTorchModel(nn.Module):
      """
      Minimal base class for torch models that:
      - Stores device and dropout regularization
      """
+
      def __init__(self, dropout_rate=0.0):
          super().__init__()
-         self.device = detect_device() # detects available devices
-         self.dropout_rate = dropout_rate # default dropout rate to be used in regularization.
+         self.device = detect_device()  # detects available devices
+         self.dropout_rate = dropout_rate  # default dropout rate to be used in regularization.
 
      def compute_saliency(
          self,
@@ -21,11 +29,11 @@ class BaseTorchModel(nn.Module):
          smoothgrad=False,
          smooth_samples=25,
          smooth_noise=0.1,
-         signed=True
+         signed=True,
      ):
          """
          Compute vanilla saliency or SmoothGrad saliency.
-
+
          Arguments:
          ----------
          x : torch.Tensor
@@ -43,7 +51,7 @@ class BaseTorchModel(nn.Module):
          """
          self.eval()
          x = x.clone().detach().requires_grad_(True)
-
+
          if smoothgrad:
              saliency_accum = torch.zeros_like(x)
              for i in range(smooth_samples):
@@ -56,7 +64,7 @@ class BaseTorchModel(nn.Module):
                  if logits.shape[1] == 1:
                      scores = logits.squeeze(1)
                  else:
-                     scores = logits[torch.arange(x.shape[0]), target_class]
+                     scores = logits[torch.arange(x.shape[0]), target_class]
                  scores.sum().backward()
                  saliency_accum += x_noisy.grad.detach()
              saliency = saliency_accum / smooth_samples
@@ -69,17 +77,17 @@ class BaseTorchModel(nn.Module):
              scores = logits[torch.arange(x.shape[0]), target_class]
              scores.sum().backward()
              saliency = x.grad.detach()
-
+
          if not signed:
              saliency = saliency.abs()
-
+
          if reduction == "sum" and x.ndim == 3:
              return saliency.sum(dim=-1)
          elif reduction == "mean" and x.ndim == 3:
              return saliency.mean(dim=-1)
          else:
              return saliency
-
+
      def compute_gradient_x_input(self, x, target_class=None):
          """
          Computes gradient × input attribution.
@@ -118,22 +126,11 @@ class BaseTorchModel(nn.Module):
              baseline = torch.zeros_like(x)
 
          attributions, delta = ig.attribute(
-             x,
-             baselines=baseline,
-             target=target_class,
-             n_steps=steps,
-             return_convergence_delta=True
+             x, baselines=baseline, target=target_class, n_steps=steps, return_convergence_delta=True
          )
          return attributions, delta
 
-     def compute_deeplift(
-         self,
-         x,
-         baseline=None,
-         target_class=None,
-         reduction="sum",
-         signed=True
-     ):
+     def compute_deeplift(self, x, baseline=None, target_class=None, reduction="sum", signed=True):
          """
          Compute DeepLIFT scores using captum.
 
@@ -158,21 +155,15 @@ class BaseTorchModel(nn.Module):
 
          if not signed:
              attr = attr.abs()
-
+
          if reduction == "sum" and x.ndim == 3:
              return attr.sum(dim=-1)
          elif reduction == "mean" and x.ndim == 3:
              return attr.mean(dim=-1)
          else:
              return attr
-
-     def compute_occlusion(
-         self,
-         x,
-         target_class=None,
-         window_size=5,
-         baseline=None
-     ):
+
+     def compute_occlusion(self, x, target_class=None, window_size=5, baseline=None):
          """
          Computes per-sample occlusion attribution.
          Supports 2D [B, S] or 3D [B, S, D] inputs.
@@ -208,9 +199,7 @@ class BaseTorchModel(nn.Module):
                  x_occluded[left:right, :] = baseline[left:right, :]
 
                  x_tensor = torch.tensor(
-                     x_occluded,
-                     device=self.device,
-                     dtype=torch.float32
+                     x_occluded, device=self.device, dtype=torch.float32
                  ).unsqueeze(0)
 
                  logits = self.forward(x_tensor)
@@ -235,7 +224,7 @@ class BaseTorchModel(nn.Module):
          device="cpu",
          target_class=None,
          normalize=True,
-         signed=True
+         signed=True,
      ):
          """
          Apply a chosen attribution method to a dataloader and store results in adata.
@@ -252,7 +241,9 @@ class BaseTorchModel(nn.Module):
                  attr = model.compute_saliency(x, target_class=target_class, signed=signed)
 
              elif method == "smoothgrad":
-                 attr = model.compute_saliency(x, smoothgrad=True, target_class=target_class, signed=signed)
+                 attr = model.compute_saliency(
+                     x, smoothgrad=True, target_class=target_class, signed=signed
+                 )
 
              elif method == "IG":
                  attributions, delta = model.compute_integrated_gradients(
@@ -261,15 +252,15 @@ class BaseTorchModel(nn.Module):
                  attr = attributions
 
              elif method == "deeplift":
-                 attr = model.compute_deeplift(x, baseline=baseline, target_class=target_class, signed=signed)
+                 attr = model.compute_deeplift(
+                     x, baseline=baseline, target_class=target_class, signed=signed
+                 )
 
              elif method == "gradxinput":
                  attr = model.compute_gradient_x_input(x, target_class=target_class)
 
              elif method == "occlusion":
-                 attr = model.compute_occlusion(
-                     x, target_class=target_class, baseline=baseline
-                 )
+                 attr = model.compute_occlusion(x, target_class=target_class, baseline=baseline)
 
              else:
                  raise ValueError(f"Unknown method {method}")
@@ -292,4 +283,4 @@ class BaseTorchModel(nn.Module):
              return target_class
          if logits.shape[1] == 1:
              return (logits > 0).long().squeeze(1)
-         return logits.argmax(dim=1)
+         return logits.argmax(dim=1)
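The compute_saliency changes above are formatting-only; the underlying technique is plain gradient saliency (optionally averaged over noisy copies for SmoothGrad). A self-contained illustration of the vanilla case, using a toy model rather than smftools code:

# Toy illustration of vanilla gradient saliency, the technique behind
# BaseTorchModel.compute_saliency; this is not smftools API.
import torch
import torch.nn as nn

model = nn.Linear(10, 1)  # toy single-logit classifier
model.eval()

x = torch.randn(4, 10, requires_grad=True)  # [batch, features]
scores = model(x).squeeze(1)                # single-logit branch, as in the diff
scores.sum().backward()                     # gradients of scores w.r.t. every input
saliency = x.grad.detach()                  # signed per-feature attribution
print(saliency.shape)                       # torch.Size([4, 10])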