PyPI - scunveil - Versions diffs - 0.1.0__tar.gz - Mend

scunveil 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

scunveil-0.1.0/LICENSE +21 -0
scunveil-0.1.0/PKG-INFO +22 -0
scunveil-0.1.0/README.md +1 -0
scunveil-0.1.0/pyproject.toml +33 -0
scunveil-0.1.0/setup.cfg +4 -0
scunveil-0.1.0/src/scunveil/__init__.py +3 -0
scunveil-0.1.0/src/scunveil/_data_operations.py +100 -0
scunveil-0.1.0/src/scunveil/_inference.py +235 -0
scunveil-0.1.0/src/scunveil/_layers.py +35 -0
scunveil-0.1.0/src/scunveil/_model.py +71 -0
scunveil-0.1.0/src/scunveil.egg-info/PKG-INFO +22 -0
scunveil-0.1.0/src/scunveil.egg-info/SOURCES.txt +13 -0
scunveil-0.1.0/src/scunveil.egg-info/dependency_links.txt +1 -0
scunveil-0.1.0/src/scunveil.egg-info/requires.txt +13 -0
scunveil-0.1.0/src/scunveil.egg-info/top_level.txt +1 -0

scunveil-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 thonzyk
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

scunveil-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,22 @@
+Metadata-Version: 2.4
+Name: scunveil
+Version: 0.1.0
+Summary: Inference package for scUnveil single-cell embeddings and gene-expression prediction.
+Author: Tomáš Honzík
+License-Expression: MIT
+Requires-Python: <3.12,>=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: tensorflow==2.15.*
+Requires-Dist: numpy<2,>=1.23.5
+Requires-Dist: pandas<2.3,>=2.0
+Requires-Dist: scipy<1.14,>=1.10
+Requires-Dist: h5py<4,>=3.8
+Requires-Dist: anndata<0.12,>=0.10
+Requires-Dist: tqdm<5,>=4.66
+Requires-Dist: huggingface_hub>=0.23
+Provides-Extra: cuda
+Requires-Dist: tensorflow[and-cuda]==2.15.*; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cuda"
+Dynamic: license-file
+# scUnveil

scunveil-0.1.0/README.md ADDED Viewed

	@@ -0,0 +1 @@
1	+ # scUnveil

scunveil-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools>=77.0.3", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "scunveil"
+version = "0.1.0"
+description = "Inference package for scUnveil single-cell embeddings and gene-expression prediction."
+readme = "README.md"
+requires-python = ">=3.9,<3.12"
+license = "MIT"
+license-files = ["LICENSE"]
+authors = [
+  { name = "Tomáš Honzík" }
+]
+dependencies = [
+  "tensorflow==2.15.*",
+  "numpy>=1.23.5,<2",
+  "pandas>=2.0,<2.3",
+  "scipy>=1.10,<1.14",
+  "h5py>=3.8,<4",
+  "anndata>=0.10,<0.12",
+  "tqdm>=4.66,<5",
+  "huggingface_hub>=0.23"
+]
+[project.optional-dependencies]
+cuda = [
+  "tensorflow[and-cuda]==2.15.* ; platform_system == 'Linux' and platform_machine == 'x86_64'"
+]
+[tool.setuptools.packages.find]
+where = ["src"]

scunveil-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

scunveil-0.1.0/src/scunveil/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from ._inference import scUnveil
+__all__ = ["scUnveil"]

scunveil-0.1.0/src/scunveil/_data_operations.py ADDED Viewed

@@ -0,0 +1,100 @@
+import tensorflow as tf
+import numpy as np
+def logits_to_CPM(pred, add_oder_of_magnitude=6.0):
+    """Convert logits to log10-scaled softmax fractions shifted by a constant.
+
+    Args:
+        pred: Logits (tensor or array-like); softmax is taken over the last axis.
+        add_oder_of_magnitude: Offset added in log10 space. With the default of
+            6.0 a fraction of 1.0 maps to 6.0, i.e. a per-million scale.
+
+    Returns:
+        NumPy float32 array holding ``log10(softmax(pred)) + add_oder_of_magnitude``.
+    """
+    logits = tf.cast(tf.convert_to_tensor(pred), tf.float32)
+    # log_softmax is used instead of log(softmax(x)): the latter yields -inf
+    # whenever a probability underflows to zero in float32.
+    log10_fraction = tf.nn.log_softmax(logits) / tf.math.log(10.0)
+    return (log10_fraction + add_oder_of_magnitude).numpy()
+def simple_scipy_norm_x(x):
+    """Apply log1p to a SciPy sparse matrix and return it as a dense float16 tensor.
+
+    Args:
+        x: SciPy sparse matrix (any format that provides ``tocoo()``).
+
+    Returns:
+        Dense float16 ``tf.Tensor`` with the same shape as ``x``; positions that
+        are not stored in the sparse matrix stay zero.
+    """
+    x = x.tocoo()
+    idx = np.stack([x.row, x.col], axis=1).astype(np.int64)
+    shape = np.array(x.shape, dtype=np.int64)
+    # Take log1p in float32 and only then narrow to float16, the same order
+    # pretrain_batch_from_x_tf uses. Casting the raw values to float16 first
+    # turns anything above 65504 into inf and rounds values above 2048.
+    vals = tf.math.log1p(tf.constant(x.data.astype(np.float32)))
+    vals = tf.cast(vals, tf.float16)
+    sp = tf.sparse.SparseTensor(idx, vals, shape)
+    sp = tf.sparse.reorder(sp)
+    return tf.sparse.to_dense(sp)
+def pretrain_batch_from_x_tf(x_batch, variable_dilution=False):
+    """Build one (input, target) pair by binomially thinning a sparse count batch.
+
+    Every stored count is split at random into a "diluted" part, which becomes
+    the model input, and the complementary remainder, which becomes the target.
+
+    Args:
+        x_batch: SciPy sparse matrix (anything providing ``tocoo()``).
+            Presumably rows are cells and columns are genes - confirm with caller.
+        variable_dilution: If True, each row gets its own keep probability
+            ``1 - U(0.1, 1.0)``; otherwise every count is thinned with p = 0.5.
+
+    Returns:
+        Tuple ``(x_diluted, y)``:
+        ``x_diluted`` is the dense float16 tensor of log1p(thinned counts);
+        ``y`` is the dense float32 tensor of the remainder divided by its row
+        sum. Rows whose remainder sums to zero are filled with ``1 / n_columns``.
+    """
+    # scipy sparse -> TF SparseTensor (COO)
+    x = x_batch.tocoo()
+    idx   = np.stack([x.row, x.col], axis=1).astype(np.int64)
+    vals  = x.data.astype(np.float32)          # counts as float32 (required by stateless_binomial)
+    shape = np.array(x.shape, dtype=np.int64)
+    sp = tf.sparse.SparseTensor(idx, vals, shape)
+    sp = tf.sparse.reorder(sp)
+    # B = number of rows in the batch
+    B = tf.cast(sp.dense_shape[0], tf.int32)
+    # dilution and keep-prob
+    if variable_dilution:
+        dilution = tf.random.uniform([B, 1], minval=0.1, maxval=1.0, dtype=tf.float32)
+        p = 1.0 - tf.squeeze(dilution, axis=1)  # [B]
+        # binomial thinning ONLY on nnz
+        # look up each stored value's row so it gets that row's probability
+        row = tf.cast(sp.indices[:, 0], tf.int32)
+        p_vals = tf.gather(p, row)
+    else:
+        # fixed keep probability shared by all values
+        p_vals = 0.5
+    # a fresh random seed pair is drawn on every call, so results are not reproducible
+    seed = tf.random.uniform([2], maxval=2**31 - 1, dtype=tf.int32)
+    diluted_i = tf.random.stateless_binomial(
+        shape=tf.shape(sp.values),
+        seed=seed,
+        counts=sp.values,          # float32
+        probs=p_vals,              # float32
+        output_dtype=tf.int32
+    )
+    diluted = tf.cast(diluted_i, tf.float32)
+    # x_diluted sparse (drop zeros)
+    keep = diluted > 0.0
+    x_dil_sp = tf.sparse.reorder(tf.sparse.SparseTensor(
+        indices=tf.boolean_mask(sp.indices, keep),
+        values=tf.boolean_mask(diluted, keep),
+        dense_shape=sp.dense_shape
+    ))
+    # complement on original support
+    comp_vals = sp.values - diluted
+    comp_sp = tf.sparse.SparseTensor(sp.indices, comp_vals, sp.dense_shape)
+    # y = complement / row-sum (sparse -> dense)
+    row2 = tf.cast(comp_sp.indices[:, 0], tf.int32)
+    y_sum = tf.math.unsorted_segment_sum(comp_sp.values, row2, num_segments=B)  # [B]
+    # rows with nothing left in the complement have no usable target distribution
+    valid_y = y_sum > 0.0
+    # divide_no_nan leaves zeros (instead of NaN) in rows whose sum is zero
+    y_vals = tf.math.divide_no_nan(comp_sp.values, tf.gather(y_sum, row2))
+    y_sp = tf.sparse.reorder(tf.sparse.SparseTensor(comp_sp.indices, y_vals, comp_sp.dense_shape))
+    # log1p norm
+    # (log1p is evaluated in float32 and the result is then narrowed to float16)
+    x_dil_sp = tf.SparseTensor(
+        indices=x_dil_sp.indices,
+        values=tf.cast(tf.math.log1p(x_dil_sp.values), tf.float16),
+        dense_shape=x_dil_sp.dense_shape
+    )
+    x_diluted = tf.sparse.to_dense(x_dil_sp)
+    y = tf.sparse.to_dense(y_sp)
+    # Replace zero-target rows with uniform labels.
+    # G = number of columns; invalid rows are all zero here, so adding 1/G makes them uniform
+    G = tf.cast(tf.shape(y)[1], y.dtype)
+    uniform_value = tf.cast(1.0, y.dtype) / G
+    y = y + tf.cast(~valid_y[:, None], y.dtype) * uniform_value
+    return x_diluted, y

scunveil-0.1.0/src/scunveil/_inference.py ADDED Viewed

@@ -0,0 +1,235 @@
+import json
+from types import SimpleNamespace
+# from src.scunveil.model import RNABagModel
+# from src.scunveil.layers import PCAProjection
+import pandas as pd
+import tensorflow as tf
+import numpy as np
+from tqdm import tqdm
+from scipy.sparse import csc_matrix
+from pathlib import Path
+from huggingface_hub import snapshot_download
+from ._model import RNABagModel
+from ._layers import PCAProjection
+from ._data_operations import simple_scipy_norm_x, logits_to_CPM
+# Hugging Face Hub repository id the scUnveil checkpoint files are downloaded from.
+SCUNVEIL_MODEL_REPO = "thonzik/sc-unveil"
+# Message shown when a method needs input data but no input AnnData has been set.
+NO_INPUT_ANNDATA_TEXT = 'No Input AnnData found, please run first "set_input_anndata(...)".'
+@tf.function
+def run_tf_model_pred(tf_model, x_input):
+    """Call ``tf_model`` on ``x_input`` with ``training=False`` inside a ``tf.function``."""
+    prediction = tf_model(x_input, training=False)
+    return prediction
+class scUnveil:
+    """Inference wrapper around the pretrained scUnveil model.
+
+    Typical use: construct the object (this downloads the checkpoint), call
+    ``set_input_anndata`` once, then read the results with ``get_embeddings``,
+    ``get_raw_embeddings`` or ``get_all_genes_prediction``.
+    """
+    def __init__(self):
+        """Download the checkpoint and build the embedding, PCA and prediction models.
+
+        The checkpoint's ``var_sorted.csv`` is kept as ``reference_var``; it is
+        used to map the gene order of the input data onto the model's gene order.
+        """
+        self.input_anndata = None
+        checkpoint_path = Path(snapshot_download(
+            repo_id=SCUNVEIL_MODEL_REPO,
+            repo_type="model",
+            allow_patterns=[
+                "var_sorted.csv",
+                "config.json",
+                "weights.h5",
+                "pca_mean.npy",
+                "pca_mat.npy",
+            ],
+        ))
+        self.checkpoint_path = checkpoint_path
+        self.reference_var = pd.read_csv(checkpoint_path / 'var_sorted.csv')
+        with open(checkpoint_path / 'config.json', 'r', encoding='utf-8') as fr:
+            self.config = SimpleNamespace(**json.load(fr))
+        cfg = self.config
+        print('Model Initialization...')
+        m = RNABagModel(n_vars=cfg.n_genes, n_layers=cfg.n_layers, emb_dim=cfg.emb_dim)
+        print('Loading Weights...')
+        m.model.load_weights(checkpoint_path / 'weights.h5')
+        # Embedder: the full network up to the input of its last layer.
+        self.raw_embedder = tf.keras.Model(
+            inputs=m.model.input,
+            outputs=m.model.layers[-2].output
+        )
+        # The mean gets a leading axis so it matches PCAProjection's (1, n) weight.
+        self.pca_mean = np.load(checkpoint_path / 'pca_mean.npy').astype(np.float32)[None, :]
+        self.pca_mat = np.load(checkpoint_path / 'pca_mat.npy').astype(np.float32)
+        self.pca_projector = tf.keras.Sequential([PCAProjection()])
+        self.pca_projector.build((None, cfg.emb_dim))
+        self.pca_projector.set_weights([self.pca_mean, self.pca_mat])
+        # Predictor: only the last layer, applied to stored raw embeddings.
+        self.expression_predictor = tf.keras.Sequential([m.model.layers[-1]])
+    def set_input_anndata(self, input_anndata, batch_size=32):
+        """Store ``input_anndata`` and compute its embeddings in batches of ``batch_size`` rows."""
+        self.input_anndata = input_anndata
+        self._process_anndata(batch_size=batch_size)
+    def _process_anndata(self, batch_size):
+        """Map the input genes onto the model genes, then embed every row of ``.X``."""
+        # Stop here when there is nothing to process; otherwise the embedding
+        # step would fail on ``None.X`` right after the message is printed.
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        self._calculate_gene_sort()
+        self._calculate_embeddings(batch_size)
+    def _detect_gene_column(self):
+        """Find which part of ``.var`` (index or a column) holds the gene identifiers.
+
+        Each candidate is compared with the first 100 reference ``feature_name``
+        values and the first 100 reference ``feature_id`` values; the pair with
+        the largest overlap wins.
+
+        Returns:
+            Tuple ``(input_genes, ref_lookup)``: the input identifiers as a list
+            of strings, and a dict mapping each of the first ``config.n_genes``
+            reference identifiers to its position.
+
+        Raises:
+            ValueError: If the best candidate shares fewer than 10 identifiers
+                with the reference sample.
+        """
+        var = self.input_anndata.var
+        ref_samples = [
+            (set(list(self.reference_var['feature_name'])[:100]), "feature_name"),
+            (set(list(self.reference_var['feature_id'])[:100]), "feature_id"),
+        ]
+        candidates = {"__index__": var.index.astype(str)}
+        for col in var.columns:
+            candidates[col] = var[col].astype(str)
+        best_col, best_score, best_type = None, 0, None
+        for name, values in candidates.items():
+            vals = set(values)
+            for ref_set, id_type in ref_samples:
+                overlap = len(vals & ref_set)
+                if overlap > best_score:
+                    best_col, best_score, best_type = name, overlap, id_type
+        if best_score < 10:
+            raise ValueError(
+                f"Could not find gene identifiers in input .var "
+                f"(best match: column='{best_col}', overlap={best_score}/100). "
+                f"Ensure .var.index or a .var column contains gene symbols "
+                f"(e.g. TP53, CD3D) or Ensembl IDs (e.g. ENSG00000141510)."
+            )
+        # Return exactly the values that were scored above.
+        ref_col = self.reference_var[best_type]
+        ref_lookup = {name: i for i, name in enumerate(ref_col[:self.config.n_genes])}
+        return list(candidates[best_col]), ref_lookup
+    def _calculate_gene_sort(self):
+        """Build ``var_map_matrix``, a sparse 0/1 matrix of shape (input genes, model genes).
+
+        Right-multiplying a batch of ``.X`` by it reorders the columns into the
+        model's gene order; input genes unknown to the model are dropped.
+        """
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        print('Mapping gene permutation...')
+        this_ad_vars, ref_var_lookup = self._detect_gene_column()
+        matched = [
+            (this_var_i, ref_var_lookup[this_var_name])
+            for this_var_i, this_var_name in enumerate(this_ad_vars)
+            if this_var_name in ref_var_lookup
+        ]
+        this_indices = np.asarray([pair[0] for pair in matched], dtype=np.int64)
+        reference_indices = np.asarray([pair[1] for pair in matched], dtype=np.int64)
+        arr_of_ones = np.ones(len(this_indices), dtype=np.float32)
+        self.var_map_matrix = csc_matrix(
+            (arr_of_ones, (this_indices, reference_indices)),
+            shape=(len(this_ad_vars), self.config.n_genes),
+        )
+        # NOTE(review): every input row is added at most once, so this check
+        # does not catch several input genes mapping to the same model gene
+        # (their values are summed by the matrix product) - confirm intent.
+        assert self.var_map_matrix.max() < 1.5, "Duplicate gene mapping detected in var_map_matrix"
+    def _calculate_embeddings(self, batch_size):
+        """Fill ``raw_embeddings`` and ``pca_embeddings`` (float16) for every row of ``.X``."""
+        print('Processing cells...')
+        n_cells_to_process = self.input_anndata.X.shape[0]
+        emb_shape = (n_cells_to_process, self.config.emb_dim)
+        self.raw_embeddings = np.zeros(emb_shape, dtype=np.float16)
+        self.pca_embeddings = np.zeros(emb_shape, dtype=np.float16)
+        # The context manager closes the progress bar even if a batch fails.
+        with tqdm(total=n_cells_to_process) as pbar:
+            for start in range(0, n_cells_to_process, batch_size):
+                # csc_matrix() accepts sparse and dense slices alike, so a dense
+                # .X no longer reaches simple_scipy_norm_x as a plain ndarray.
+                x = csc_matrix(self.input_anndata.X[start:start + batch_size])
+                x = x @ self.var_map_matrix  # still sparse, shape (n_rows, n_genes)
+                x = simple_scipy_norm_x(x)
+                n_batch = x.shape[0]
+                this_raw_emb = run_tf_model_pred(self.raw_embedder, x)
+                self.raw_embeddings[start:start + n_batch] = this_raw_emb.numpy()
+                this_pca_emb = run_tf_model_pred(self.pca_projector, this_raw_emb)
+                self.pca_embeddings[start:start + n_batch] = this_pca_emb.numpy()
+                pbar.update(n_batch)
+    def get_raw_embeddings(self):
+        """Return a copy of the raw embeddings, or None (with a message) if no input is set."""
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        return self.raw_embeddings.copy()
+    def get_embeddings(self, n_features=512):
+        """Return a copy of the PCA-projected embeddings.
+
+        Args:
+            n_features: Number of leading PCA columns to return; ``None`` returns
+                all of them. Must not exceed ``config.emb_dim``.
+
+        Returns:
+            float16 array with one row per input row, or None (with a message)
+            if no input is set.
+        """
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        if n_features is None:
+            return self.pca_embeddings.copy()
+        assert n_features <= self.config.emb_dim, "n_features must not exceed config.emb_dim"
+        return self.pca_embeddings[:, :n_features].copy()
+    def get_all_genes_prediction(self, batch_size=128):
+        """Predict values for all model genes from the stored raw embeddings.
+
+        Args:
+            batch_size: Number of rows sent through the predictor at once.
+
+        Returns:
+            float16 array of shape (rows, ``config.n_genes``) holding the output
+            of ``logits_to_CPM``, or None (with a message) if no input is set.
+        """
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        n_cells_to_process = self.raw_embeddings.shape[0]
+        gene_expressions = np.zeros((n_cells_to_process, self.config.n_genes), dtype=np.float16)
+        with tqdm(total=n_cells_to_process) as pbar:
+            for start in range(0, n_cells_to_process, batch_size):
+                this_raw_emb = self.raw_embeddings[start:start + batch_size]
+                this_prediction = run_tf_model_pred(self.expression_predictor, this_raw_emb)
+                gene_expressions[start:start + batch_size] = logits_to_CPM(this_prediction)
+                pbar.update(this_raw_emb.shape[0])
+        return gene_expressions
+    def get_specific_genes_prediction(self, list_of_gene_names):
+        """Placeholder: not implemented yet, always returns None."""
+        pass
+    def get_genes_embeddings(self):
+        """Placeholder: not implemented yet, always returns None."""
+        if self.input_anndata is None:
+            print(NO_INPUT_ANNDATA_TEXT)
+            return
+        pass

scunveil-0.1.0/src/scunveil/_layers.py ADDED Viewed

@@ -0,0 +1,35 @@
+import tensorflow as tf
+class GatedSkipAdd(tf.keras.layers.Layer):
+    """Merge a skip connection through one learned scalar gate, then rescale.
+
+    For inputs ``[x, skip]`` the layer returns
+    ``(x + alpha * skip) / sqrt(1 + alpha**2 + eps)``.
+
+    Args:
+        alpha_init: Initial value of the trainable gate ``alpha``; with 0.0 the
+            skip branch contributes nothing at initialization.
+        eps: Small constant added under the square root.
+    """
+    def __init__(self, alpha_init=0.0, eps=1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.alpha_init = alpha_init
+        self.eps = eps
+    def build(self, input_shape):
+        """Create the scalar gate weight."""
+        self.alpha = self.add_weight(
+            name="alpha",
+            shape=(),
+            initializer=tf.keras.initializers.Constant(self.alpha_init),
+            trainable=True,
+        )
+        super().build(input_shape)
+    def call(self, inputs):
+        """Return the gated, rescaled sum of the two tensors in ``inputs``."""
+        x, skip = inputs
+        a = self.alpha
+        y = x + a * skip
+        denom = tf.sqrt(1.0 + tf.square(a) + self.eps)
+        return y / denom
+    def get_config(self):
+        """Return the constructor arguments so the layer can be serialized and re-created."""
+        config = super().get_config()
+        config.update({"alpha_init": self.alpha_init, "eps": self.eps})
+        return config
+class PCAProjection(tf.keras.layers.Layer):
+    """Frozen affine projection: subtract a stored mean, then multiply by a stored matrix."""
+    def build(self, input_shape):
+        """Create the non-trainable ``mean`` (1, n) and ``pca`` (n, n) weights, in that order."""
+        n_features = input_shape[-1]
+        self.mean = self.add_weight(name="mean", shape=(1, n_features), trainable=False)
+        self.pca = self.add_weight(name="pca", shape=(n_features, n_features), trainable=False)
+    def call(self, x):
+        """Return ``(x - mean) @ pca``."""
+        centered = x - self.mean
+        return centered @ self.pca

scunveil-0.1.0/src/scunveil/_model.py ADDED Viewed

@@ -0,0 +1,71 @@
+import tensorflow as tf
+from tensorflow.keras.layers import Dense, Input, LayerNormalization, Add
+from tensorflow.keras.models import Model
+from tensorflow.keras.optimizers import AdamW
+from tensorflow.keras.losses import categorical_crossentropy
+# from src.scunveil.layers import GatedSkipAdd
+from ._layers import GatedSkipAdd
+def c_xent(y_true, y_pred):
+    """Return the mean categorical cross-entropy of logits ``y_pred`` against ``y_true``.
+
+    Both arguments are cast to float32 before the loss is evaluated.
+    """
+    targets = tf.cast(y_true, 'float32')
+    logits = tf.cast(y_pred, 'float32')
+    per_sample = categorical_crossentropy(targets, logits, from_logits=True)
+    return tf.reduce_mean(per_sample)
+class RNABagModel:
+    """Residual feed-forward network with U-Net-style gated skip connections.
+
+    The Keras model is available as ``self.model``. It maps a vector of length
+    ``n_vars`` to ``n_vars`` output logits (the pretraining loss is applied
+    with ``from_logits=True``).
+    """
+    def __init__(self, n_vars, n_layers, emb_dim, ff_dim=None, activation='relu'):
+        """Build the Keras model.
+
+        Args:
+            n_vars: Size of the input vector and of the output layer.
+            n_layers: Number of residual blocks; must be at least 2.
+            emb_dim: Width of the hidden representation.
+            ff_dim: Width of the hidden Dense layer inside each block;
+                defaults to ``4 * emb_dim``.
+            activation: Activation of the hidden Dense layer inside each block.
+        """
+        assert n_layers >= 2
+        if ff_dim is None:
+            ff_dim = emb_dim * 4
+        self.n_vars = n_vars
+        self.n_layers = n_layers
+        self.emb_dim = emb_dim
+        input_x = Input(shape=(n_vars,), name="x")
+        emb_x = Dense(emb_dim, use_bias=True, name="emb_0")(input_x)
+        x = emb_x
+        # Stack of block inputs saved in the first half of the network.
+        u_net_residuals = []
+        half = n_layers // 2
+        for layer_i in range(n_layers):
+            # First `half` blocks: remember the block input for a later skip.
+            if layer_i < half:
+                u_net_residuals.append(x)
+            # Last `half` blocks: merge the most recently saved input (LIFO),
+            # so the first block pairs with the last one.
+            if layer_i >= n_layers - half:
+                skip = u_net_residuals.pop()
+                x = GatedSkipAdd()([x, skip])
+            # Pre-norm feed-forward block added back residually.
+            x_add = LayerNormalization(epsilon=1e-6)(x)
+            x_add = Dense(ff_dim, activation=activation)(x_add)
+            x_add = Dense(emb_dim)(x_add)
+            x = Add(name=f"emb_{layer_i+1}")([x, x_add])
+        # Final projection back to n_vars (no activation).
+        output_layer = Dense(n_vars, name="out")(x)
+        inputs = input_x
+        self.model = Model(
+            inputs=inputs,
+            outputs=output_layer
+        )
+    def compile_pretrain(self, lr=1e-3, wd=0.0, clipnorm=None):
+        """Compile ``self.model`` with AdamW and the ``c_xent`` loss, then print the parameter count.
+
+        Args:
+            lr: Learning rate passed to AdamW.
+            wd: Weight decay passed to AdamW.
+            clipnorm: Gradient-norm clipping value passed to AdamW (None disables it).
+        """
+        optimizer = AdamW(
+            learning_rate=lr,
+            weight_decay=wd,
+            clipnorm=clipnorm,
+        )
+        self.model.compile(
+            optimizer=optimizer,
+            loss=c_xent,
+        )
+        print(f'Number of parameters: {self.model.count_params():,}')

scunveil-0.1.0/src/scunveil.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,22 @@
+Metadata-Version: 2.4
+Name: scunveil
+Version: 0.1.0
+Summary: Inference package for scUnveil single-cell embeddings and gene-expression prediction.
+Author: Tomáš Honzík
+License-Expression: MIT
+Requires-Python: <3.12,>=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: tensorflow==2.15.*
+Requires-Dist: numpy<2,>=1.23.5
+Requires-Dist: pandas<2.3,>=2.0
+Requires-Dist: scipy<1.14,>=1.10
+Requires-Dist: h5py<4,>=3.8
+Requires-Dist: anndata<0.12,>=0.10
+Requires-Dist: tqdm<5,>=4.66
+Requires-Dist: huggingface_hub>=0.23
+Provides-Extra: cuda
+Requires-Dist: tensorflow[and-cuda]==2.15.*; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cuda"
+Dynamic: license-file
+# scUnveil

scunveil-0.1.0/src/scunveil.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+LICENSE
+README.md
+pyproject.toml
+src/scunveil/__init__.py
+src/scunveil/_data_operations.py
+src/scunveil/_inference.py
+src/scunveil/_layers.py
+src/scunveil/_model.py
+src/scunveil.egg-info/PKG-INFO
+src/scunveil.egg-info/SOURCES.txt
+src/scunveil.egg-info/dependency_links.txt
+src/scunveil.egg-info/requires.txt
+src/scunveil.egg-info/top_level.txt

scunveil-0.1.0/src/scunveil.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

scunveil-0.1.0/src/scunveil.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,13 @@
+tensorflow==2.15.*
+numpy<2,>=1.23.5
+pandas<2.3,>=2.0
+scipy<1.14,>=1.10
+h5py<4,>=3.8
+anndata<0.12,>=0.10
+tqdm<5,>=4.66
+huggingface_hub>=0.23
+[cuda]
+[cuda:platform_system == "Linux" and platform_machine == "x86_64"]
+tensorflow[and-cuda]==2.15.*

scunveil-0.1.0/src/scunveil.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ scunveil