PyPI - scdataloader - Versions diffs - 1.9.1__tar.gz → 2.0.0__tar.gz - Mend

scdataloader 1.9.1__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

{scdataloader-1.9.1 → scdataloader-2.0.0}/.gitignore RENAMED Viewed

@@ -135,3 +135,6 @@ figures/*/*.png
 figures/*.png
 figures/add_postp_clust.py
 figures/age_relabel.py
+notebooks/figures/umap_*.png
+notebooks/data/
+data/gene_names/

scdataloader-2.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Jérémie Kalfon
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

{scdataloader-1.9.1 → scdataloader-2.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 1.9.1
+Version: 2.0.0
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -12,14 +12,13 @@ Requires-Dist: anndata>=0.9.0
 Requires-Dist: biomart>=0.9.0
 Requires-Dist: cellxgene-census>=0.1.0
 Requires-Dist: django>=4.0.0
-Requires-Dist: harmonypy>=0.0.10
 Requires-Dist: ipykernel>=6.20.0
 Requires-Dist: jupytext>=1.16.0
-Requires-Dist: lamindb[bionty,cellregistry,jupyter,ourprojects,zarr]<2,>=1.0.4
+Requires-Dist: lamindb[bionty,cellregistry,jupyter,zarr]==1.0.4
 Requires-Dist: leidenalg>=0.8.0
+Requires-Dist: lightning>=2.3.0
 Requires-Dist: matplotlib>=3.5.0
 Requires-Dist: numpy==1.26.0
-Requires-Dist: palantir>=1.3.3
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pytorch-lightning>=2.3.0
 Requires-Dist: scikit-misc>=0.5.0
@@ -71,7 +70,16 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
-built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+```
+Portions of the mapped.py file are derived from Lamin Labs
+Copyright 2024 Lamin Labs
+Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+The rest of the package is licensed under MIT License, see LICENSE for details
+Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
+for the original implementation
+```
 The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).

{scdataloader-1.9.1 → scdataloader-2.0.0}/README.md RENAMED Viewed

@@ -28,7 +28,16 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
-built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+```
+Portions of the mapped.py file are derived from Lamin Labs
+Copyright 2024 Lamin Labs
+Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+The rest of the package is licensed under MIT License, see LICENSE for details
+Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
+for the original implementation
+```
 The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).

{scdataloader-1.9.1 → scdataloader-2.0.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "1.9.1"
+version = "2.0.0"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.14"
 keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
 dependencies = [
     "numpy==1.26.0",
-    "lamindb[bionty,ourprojects,jupyter,cellregistry,zarr]>=1.0.4,<2",
+    "lamindb[bionty,jupyter,cellregistry,zarr]==1.0.4",
     "cellxgene-census>=0.1.0",
     "torch==2.2.0",
     "pytorch-lightning>=2.3.0",
@@ -26,10 +26,9 @@ dependencies = [
     "leidenalg>=0.8.0",
     "django>=4.0.0",
     "scikit-misc>=0.5.0",
-    "palantir>=1.3.3",
-    "harmonypy>=0.0.10",
     "jupytext>=1.16.0",
+    "lightning>=2.3.0",
+    "pytorch-lightning>=2.3.0",
 ]
 [project.optional-dependencies]

{scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/__init__.py RENAMED Viewed

@@ -1,7 +1,8 @@
+from importlib.metadata import version
 from .collator import Collator
 from .data import Dataset, SimpleAnnDataset
 from .datamodule import DataModule
 from .preprocess import Preprocessor
-from importlib.metadata import version
 __version__ = version("scdataloader")

{scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/collator.py RENAMED Viewed

@@ -3,7 +3,7 @@ from typing import Optional
 import numpy as np
 from torch import Tensor, long
-from .utils import downsample_profile, load_genes
+from .utils import load_genes
 class Collator:
@@ -22,8 +22,6 @@ class Collator:
         organism_name: str = "organism_ontology_term_id",
         class_names: list[str] = [],
         genelist: list[str] = [],
-        downsample: Optional[float] = None,  # don't use it for training!
-        save_output: Optional[str] = None,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -57,13 +55,8 @@ class Collator:
             class_names (list, optional): List of other classes to be considered. Defaults to [].
             genelist (list, optional): List of genes to be considered. Defaults to [].
                 If [] all genes will be considered
-            downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
-                This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
-            save_output (str, optional): If not None, saves the output to a file. Defaults to None.
-                This is mainly for debugging purposes
         """
         self.organisms = organisms
-        self.genedf = load_genes(organisms)
         self.max_len = max_len
         self.n_bins = n_bins
         self.add_zero_genes = add_zero_genes
@@ -75,14 +68,14 @@ class Collator:
         self.organism_name = organism_name
         self.tp_name = tp_name
         self.class_names = class_names
-        self.save_output = save_output
         self.start_idx = {}
         self.accepted_genes = {}
-        self.downsample = downsample
         self.to_subset = {}
-        self._setup(org_to_id, valid_genes, genelist)
+        self._setup(None, org_to_id, valid_genes, genelist)
-    def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
+    def _setup(self, genedf=None, org_to_id=None, valid_genes=[], genelist=[]):
+        if genedf is None:
+            genedf = load_genes(self.organisms)
         self.org_to_id = org_to_id
         self.to_subset = {}
         self.accepted_genes = {}
@@ -92,14 +85,17 @@ class Collator:
             if org_to_id is not None
             else set(self.organisms)
         )
+        if len(valid_genes) > 0:
+            if len(set(valid_genes) - set(genedf.index)) > 0:
+                print("Some valid genes are not in the genedf!!!")
+            tot = genedf[genedf.index.isin(valid_genes)]
+        else:
+            tot = genedf
         for organism in self.organisms:
-            ogenedf = self.genedf[self.genedf.organism == organism]
-            if len(valid_genes) > 0:
-                tot = self.genedf[self.genedf.index.isin(valid_genes)]
-            else:
-                tot = self.genedf
             org = org_to_id[organism] if org_to_id is not None else organism
             self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
+            ogenedf = genedf[genedf.organism == organism]
             if len(valid_genes) > 0:
                 self.accepted_genes.update({org: ogenedf.index.isin(valid_genes)})
             if len(genelist) > 0:
@@ -148,7 +144,6 @@ class Collator:
                         :, self.accepted_genes[organism_id]
                     ]
             if self.how == "most expr":
-                nnz_loc = np.where(expr > 0)[0]
                 if "knn_cells" in elem:
                     nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
                     ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
@@ -161,14 +156,18 @@ class Collator:
                 # loc = np.argsort(expr)[-(self.max_len) :][::-1]
             elif self.how == "random expr":
                 nnz_loc = np.where(expr > 0)[0]
-                loc = nnz_loc[
-                    np.random.choice(
-                        len(nnz_loc),
-                        self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc),
-                        replace=False,
-                        # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
-                    )
-                ]
+                loc = (
+                    nnz_loc[
+                        np.random.choice(
+                            len(nnz_loc),
+                            self.max_len,
+                            replace=False,
+                            # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
+                        )
+                    ]
+                    if self.max_len < len(nnz_loc)
+                    else nnz_loc
+                )
             elif self.how in ["all", "some"]:
                 loc = np.arange(len(expr))
             else:
@@ -179,23 +178,19 @@ class Collator:
                 "all",
                 "some",
             ]:
+                ma = self.add_zero_genes + (
+                    0 if self.max_len < len(nnz_loc) else self.max_len - len(nnz_loc)
+                )
                 if "knn_cells" in elem:
                     # we complete with genes expressed in the knn
-                    nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
-                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
                     # which is not a zero_loc in this context
-                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-(ma):][::-1]
+                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-ma:][::-1]
                 else:
                     zero_loc = np.where(expr == 0)[0]
                     zero_loc = zero_loc[
                         np.random.choice(
                             len(zero_loc),
-                            self.add_zero_genes
-                            + (
-                                0
-                                if self.max_len < len(nnz_loc)
-                                else self.max_len - len(nnz_loc)
-                            ),
+                            ma,
                             replace=False,
                         )
                     ]
@@ -255,13 +250,6 @@ class Collator:
             ret.update({"knn_cells": Tensor(knn_cells)})
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
-        if self.downsample is not None:
-            ret["x"] = downsample_profile(ret["x"], self.downsample)
-        if self.save_output is not None:
-            with open(self.save_output, "a") as f:
-                np.savetxt(f, ret["x"].numpy())
-            with open(self.save_output + "_loc", "a") as f:
-                np.savetxt(f, gene_locs)
         return ret

{scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/config.py RENAMED Viewed

@@ -113,26 +113,34 @@ COARSE_ASSAY = {
 MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
-    "HsapDv:0010000": [
+    "HsapDv:0010000": [  # postnatal stage
         "MmusDv:0000092",  # postnatal stage
     ],
-    "HsapDv:0000258": [  # mature stage
+    "HsapDv:0000258": [  # mature stage >15
         "MmusDv:0000110",  # mature stage
-        "HsapDv:0000204", #
+        "HsapDv:0000204",  #
     ],
-    "HsapDv:0000227": [  # late adult stage
+    "HsapDv:0000087": [],  # adult stage >19
+    "HsapDv:0000227": [  # late adult stage > 40
         "MmusDv:0000091",  # 20 month-old stage
         "MmusDv:0000089",  # 18 month-old stage
+        "HsapDv:0000091",  # > 45
+        "HsapDv:0000093",  # > 65
+    ],
+    "HsapDv:0000272": [  # 60-79 year-old stage
+        "HsapDv:0000094",  # 60-79 year-old stage
     ],
-    "HsapDv:0000272": [],  # 60-79 year-old stage
     "HsapDv:0000095": [],  # 80 year-old and over stage
-    "HsapDv:0000267": [  # middle aged stage
+    "HsapDv:0000267": [  # middle aged stage >40 <60
         "MmusDv:0000087",  # 16 month-old stage
         "UBERON:0018241",  # prime adult stage
         "MmusDv:0000083",  # 12 month-old stage
         "HsapDv:0000092",  # same
     ],
-    "HsapDv:0000266": [  # young adult stage
+    "HsapDv:0000266": [  # young adult stage <40
+        "HsapDv:0000088",  # mature stage
+        "HsapDv:0000090",  # 25 - 44
+        "HsapDv:0000086",  # adolescent stage
         "MmusDv:0000050",  # 6 weeks
         "HsapDv:0000089",  # same
         "MmusDv:0000051",  # 7 weeks
@@ -163,22 +171,30 @@ MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
         "MmusDv:0000099",  # 26 weeks
         "MmusDv:0000102",  # 29 weeks
     ],
-    "HsapDv:0000265": [],  # child stage (1-4 yo)
+    "HsapDv:0000265": [  # child stage (1-4 yo)
+        "HsapDv:0000084",  # 2-5 yo
+    ],
     "HsapDv:0000271": [  # juvenile stage (5-14 yo)
         "MmusDv:0000048",  # 4 weeks
         "MmusDv:0000049",  # 5 weeks
+        "HsapDv:0000081",  # child
+        "HsapDv:0000085",  # 6-11 yo
     ],
-    "HsapDv:0000260": [  # infant stage
+    "HsapDv:0000260": [  # infant stage <2
         "MmusDv:0000046",  # 2 weeks
         "MmusDv:0000045",  # 1 week
         "MmusDv:0000047",  # 3 weeks
         "HsapDv:0000083",
+        "HsapDv:0000256",  # under 1 yo
     ],
     "HsapDv:0000262": [  # newborn stage (0-28 days)
         "MmusDv:0000036",  # Theiler stage 27
         "MmusDv:0000037",  # Theiler stage 28
         "MmusDv:0000113",  # 4-7 days
+        "HsapDv:0000174",  # 1 month-old stage
+        "HsapDv:0000082",  # newborn stage
     ],
+    "HsapDv:0000002": [],  # embryonic stage
     "HsapDv:0000007": [],  # Carnegie stage 03
     "HsapDv:0000008": [],  # Carnegie stage 04
     "HsapDv:0000009": [],  # Carnegie stage 05

scdataloader 1.9.1__tar.gz → 2.0.0__tar.gz

scdataloader 1.9.1__tar.gz → 2.0.0__tar.gz