PyPI - scdataloader - Versions diffs - 0.0.4__tar.gz → 1.0.5__tar.gz - Mend

scdataloader 0.0.4tar.gz → 1.0.5tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{scdataloader-0.0.4 → scdataloader-1.0.5}/PKG-INFO +82 -26
scdataloader-1.0.5/README.md +154 -0
scdataloader-1.0.5/pyproject.toml +62 -0
scdataloader-1.0.5/scdataloader/VERSION +1 -0
scdataloader-1.0.5/scdataloader/__init__.py +4 -0
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/__main__.py +3 -0
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/collator.py +61 -96
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/config.py +6 -0
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/data.py +138 -90
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/datamodule.py +67 -39
scdataloader-1.0.5/scdataloader/mapped.py +540 -0
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/preprocess.py +4 -213
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/utils.py +128 -92
scdataloader-0.0.4/README.md +0 -107
scdataloader-0.0.4/pyproject.toml +0 -53
scdataloader-0.0.4/scdataloader/VERSION +0 -1
scdataloader-0.0.4/scdataloader/__init__.py +0 -4
scdataloader-0.0.4/scdataloader/mapped.py +0 -358
{scdataloader-0.0.4 → scdataloader-1.0.5}/LICENSE +0 -0
{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/base.py +0 -0

{scdataloader-0.0.4 → scdataloader-1.0.5}/PKG-INFO RENAMED Viewed

@@ -1,28 +1,37 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.4
+Version: 1.0.5
 Summary: a dataloader for single cell data in lamindb
 Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
-Keywords: scRNAseq,dataloader,pytorch,lamindb,scPrint
+Keywords: scRNAseq,dataloader,pytorch,lamindb,scPRINT
 Author: jkobject
 Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
+Provides-Extra: dev
 Requires-Dist: anndata
 Requires-Dist: biomart
-Requires-Dist: bionty
+Requires-Dist: bionty (==0.48.0)
+Requires-Dist: black (>=23.10.1,<24.0.0) ; extra == "dev"
 Requires-Dist: cellxgene-census
+Requires-Dist: coverage (>=7.3.2,<8.0.0) ; extra == "dev"
 Requires-Dist: decoupler
 Requires-Dist: django
+Requires-Dist: flake8 (>=6.1.0,<7.0.0) ; extra == "dev"
+Requires-Dist: gitchangelog (>=3.0.4,<4.0.0) ; extra == "dev"
 Requires-Dist: ipykernel
-Requires-Dist: lamindb
+Requires-Dist: isort (>=5.12.0,<6.0.0) ; extra == "dev"
+Requires-Dist: lamindb (==0.75.1)
 Requires-Dist: leidenalg
 Requires-Dist: lightning
-Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
+Requires-Dist: mkdocs (>=1.5.3,<2.0.0) ; extra == "dev"
+Requires-Dist: mypy (>=1.6.1,<2.0.0) ; extra == "dev"
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: pytest (>=7.4.3,<8.0.0) ; extra == "dev"
+Requires-Dist: pytest-cov (>=4.1.0,<5.0.0) ; extra == "dev"
 Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
@@ -34,14 +43,16 @@ Description-Content-Type: text/markdown
 [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
 [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
-[![DOI](https://zenodo.org/badge/731248665.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
+[![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
+[![Documentation Status](https://readthedocs.org/projects/scDataLoader/badge/?version=latest)](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+[![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
+[![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
+[![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
+[![GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)](https://img.shields.io/github/issues/jkobject/scDataLoader)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)
-Awesome single cell dataloader created by @jkobject
-built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
-This data loader is designed to be used with:
+This single cell pytorch dataloader / lightning datamodule is designed to be used with:
 - [lamindb](https://lamin.ai/)
@@ -57,18 +68,15 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
-## About
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
-the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
-1. loading from lamin
-2. doing some dataset specific preprocessing if needed
-3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
-4. passing it to a dataloader object that can work with it correctly
+## More
-Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)
+I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
-![docs/scdataloader.drawio.png](docs/scdataloader.drawio.png)
+![scdataloader.drawio.png](docs/scdataloader.drawio.png)
 ## Install it from PyPI
@@ -80,13 +88,13 @@ pip install scdataloader
 ```bash
 git clone https://github.com/jkobject/scDataLoader.git
-cd scDataLoader
-poetry install
+pip install -e scDataLoader
 ```
-then run the notebooks with the poetry installed environment
 ## Usage
+### Direct Usage
 ```python
 # initialize a local lamin database
 # !lamin init --storage ~/scdataloader --schema bionty
@@ -129,15 +137,63 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
 ```
-see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
+see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
-1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
-2. [create a dataset](https://jkobject.github.io/scDataLoader/notebooks/02_create_dataset.html)
+### command line preprocessing
+You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
+```bash
+scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+```
+### command line usage
+The main way to use
+> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
+## FAQ
+### how to update my ontologies?
+```python
+import bionty as bt
+bt.reset_sources()
+# Run via CLI: lamin load <your instance>
+import lnschema_bionty as lb
+lb.dev.sync_bionty_source_to_latest()
+```
+### how to load all ontologies?
+```python
+from scdataloader import utils
+utils.populate_ontologies() # this might take from 5-20mins
+```
 ## Development
 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+## License
+This project is licensed under the GPL-3.0 License - see the [LICENSE](LICENSE) file for details.
+## Acknowledgments
+- [lamin.ai](https://lamin.ai/)
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+- [scprint](https://www.jkobject.com/scPRINT/)
+Awesome single cell dataloader created by @jkobject
                     GNU GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007

scdataloader-1.0.5/README.md ADDED Viewed

@@ -0,0 +1,154 @@
+# scdataloader
+[![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
+[![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
+[![Documentation Status](https://readthedocs.org/projects/scDataLoader/badge/?version=latest)](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+[![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
+[![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
+[![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
+[![GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)](https://img.shields.io/github/issues/jkobject/scDataLoader)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)
+This single cell pytorch dataloader / lightning datamodule is designed to be used with:
+- [lamindb](https://lamin.ai/)
+and:
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+It allows you to:
+1. load thousands of datasets containing millions of cells in a few seconds.
+2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
+3. create a more complex single cell dataset
+4. extend it to your need
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
+The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
+## More
+I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
+![scdataloader.drawio.png](docs/scdataloader.drawio.png)
+## Install it from PyPI
+```bash
+pip install scdataloader
+```
+### Install it locally and run the notebooks:
+```bash
+git clone https://github.com/jkobject/scDataLoader.git
+pip install -e scDataLoader
+```
+## Usage
+### Direct Usage
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"], #organism that we will work on
+    how="most expr", # for the collator (most expr genes only will be selected)
+    max_len=1000, # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+# with lightning:
+# Trainer(model, datamodule)
+```
+see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+### command line preprocessing
+You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
+```bash
+scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+```
+### command line usage
+The main way to use
+> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
+## FAQ
+### how to update my ontologies?
+```python
+import bionty as bt
+bt.reset_sources()
+# Run via CLI: lamin load <your instance>
+import lnschema_bionty as lb
+lb.dev.sync_bionty_source_to_latest()
+```
+### how to load all ontologies?
+```python
+from scdataloader import utils
+utils.populate_ontologies() # this might take from 5-20mins
+```
+## Development
+Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+## License
+This project is licensed under the GPL-3.0 License - see the [LICENSE](LICENSE) file for details.
+## Acknowledgments
+- [lamin.ai](https://lamin.ai/)
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+- [scprint](https://www.jkobject.com/scPRINT/)
+Awesome single cell dataloader created by @jkobject

scdataloader-1.0.5/pyproject.toml ADDED Viewed

@@ -0,0 +1,62 @@
+[tool.poetry]
+name = "scdataloader"
+version = "1.0.5"
+description = "a dataloader for single cell data in lamindb"
+authors = ["jkobject"]
+license = "GPL3"
+readme = ["README.md", "LICENSE"]
+repository = "https://github.com/jkobject/scDataLoader"
+keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
+[tool.poetry.dependencies]
+python = "3.10.*"
+lamindb = "0.75.1"
+bionty = "0.48.0"
+cellxgene-census = "*"
+torch = "*"
+lightning = "*"
+anndata = "*"
+matplotlib = "*"
+seaborn = "*"
+ipykernel = "*"
+torchdata = "*"
+biomart = "*"
+pandas = ">=2.0.0"
+leidenalg = "*"
+decoupler = "*"
+django = "*"
+scikit-misc = "*"
+pytest = { version = "^7.4.3", optional = true }
+coverage = { version = "^7.3.2", optional = true }
+flake8 = { version = "^6.1.0", optional = true }
+black = { version = "^23.10.1", optional = true }
+isort = { version = "^5.12.0", optional = true }
+pytest-cov = { version = "^4.1.0", optional = true }
+mypy = { version = "^1.6.1", optional = true }
+gitchangelog = { version = "^3.0.4", optional = true }
+mkdocs = { version = "^1.5.3", optional = true }
+[tool.poetry.extras]
+dev = [
+    "pytest",
+    "coverage",
+    "flake8",
+    "black",
+    "isort",
+    "pytest-cov",
+    "mypy",
+    "gitchangelog",
+    "mkdocs",
+    "mkdocs-git-revision-date-localized-plugin",
+    "mkdocstrings",
+    "mkdocs-git-authors-plugin",
+    "mkdocs-jupyter",
+    "mkdocstrings-python"
+]
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+[tool.poetry.scripts]
+scdataloader = 'scdataloader.__main__:main'

scdataloader-1.0.5/scdataloader/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1.0.5

scdataloader-1.0.5/scdataloader/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .data import Dataset, SimpleAnnDataset
+from .datamodule import DataModule
+from .preprocess import Preprocessor
+from .collator import Collator

{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/__main__.py RENAMED Viewed

@@ -10,6 +10,9 @@ from typing import Optional, Union
 # scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
 def main():
+    """
+    main function to preprocess datasets in a given lamindb collection.
+    """
     parser = argparse.ArgumentParser(
         description="Preprocess datasets in a given lamindb collection."
     )

{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/collator.py RENAMED Viewed

@@ -1,26 +1,27 @@
 import numpy as np
-from .utils import load_genes
+from .utils import load_genes, downsample_profile
 from torch import Tensor, long
-# class SimpleCollator:
+from typing import Optional
 class Collator:
     def __init__(
         self,
-        organisms: list,
-        how="all",
-        org_to_id: dict = None,
-        valid_genes: list = [],
-        max_len=2000,
-        add_zero_genes=0,
-        logp1=False,
-        norm_to=None,
-        n_bins=0,
-        tp_name=None,
-        organism_name="organism_ontology_term_id",
-        class_names=[],
-        genelist=[],
+        organisms: list[str],
+        how: str = "all",
+        org_to_id: dict[str, int] = None,
+        valid_genes: list[str] = [],
+        max_len: int = 2000,
+        add_zero_genes: int = 0,
+        logp1: bool = False,
+        norm_to: Optional[float] = None,
+        n_bins: int = 0,
+        tp_name: Optional[str] = None,
+        organism_name: str = "organism_ontology_term_id",
+        class_names: list[str] = [],
+        genelist: list[str] = [],
+        downsample: Optional[float] = None,  # don't use it for training!
+        save_output: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -44,38 +45,57 @@ class Collator:
             org_to_id (dict): Dictionary mapping organisms to their respective IDs.
             valid_genes (list, optional): List of genes from the datasets, to be considered. Defaults to [].
                 it will drop any other genes from the input expression data (useful when your model only works on some genes)
-            max_len (int, optional): Maximum number of genes to use (for random expr and most expr). Defaults to 2000.
+            max_len (int, optional): Total number of genes to use (for random expr and most expr). Defaults to 2000.
             n_bins (int, optional): Number of bins for binning the data. Defaults to 0. meaning, no binning of expression.
             add_zero_genes (int, optional): Number of additional unexpressed genes to add to the input data. Defaults to 0.
             logp1 (bool, optional): If True, logp1 normalization is applied. Defaults to False.
-            norm_to (str, optional): Normalization method to be applied. Defaults to None.
+            norm_to (float, optional): Rescaling value of the normalization to be applied. Defaults to None.
+            organism_name (str, optional): Name of the organism ontology term id. Defaults to "organism_ontology_term_id".
+            tp_name (str, optional): Name of the heat diff. Defaults to None.
+            class_names (list, optional): List of other classes to be considered. Defaults to [].
+            genelist (list, optional): List of genes to be considered. Defaults to [].
+                If [] all genes will be considered
+            downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
+                This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
+            save_output (bool, optional): If True, saves the output to a file. Defaults to False.
+                This is mainly for debugging purposes
         """
         self.organisms = organisms
+        self.genedf = load_genes(organisms)
         self.max_len = max_len
         self.n_bins = n_bins
         self.add_zero_genes = add_zero_genes
         self.logp1 = logp1
         self.norm_to = norm_to
-        self.org_to_id = org_to_id
         self.how = how
-        self.organism_ids = (
-            set([org_to_id[k] for k in organisms])
-            if org_to_id is not None
-            else set(organisms)
-        )
         if self.how == "some":
             assert len(genelist) > 0, "if how is some, genelist must be provided"
         self.organism_name = organism_name
         self.tp_name = tp_name
         self.class_names = class_names
+        self.save_output = save_output
         self.start_idx = {}
         self.accepted_genes = {}
-        self.genedf = load_genes(organisms)
+        self.downsample = downsample
         self.to_subset = {}
-        for organism in set(self.genedf.organism):
+        self._setup(org_to_id, valid_genes, genelist)
+    def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
+        self.org_to_id = org_to_id
+        self.to_subset = {}
+        self.accepted_genes = {}
+        self.start_idx = {}
+        self.organism_ids = (
+            set([org_to_id[k] for k in self.organisms])
+            if org_to_id is not None
+            else set(self.organisms)
+        )
+        for organism in self.organisms:
             ogenedf = self.genedf[self.genedf.organism == organism]
-            tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            if len(valid_genes) > 0:
+                tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            else:
+                tot = self.genedf
             org = org_to_id[organism] if org_to_id is not None else organism
             self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
             if len(valid_genes) > 0:
@@ -84,14 +104,14 @@ class Collator:
                 df = ogenedf[ogenedf.index.isin(valid_genes)]
                 self.to_subset.update({org: df.index.isin(genelist)})
-    def __call__(self, batch):
+    def __call__(self, batch) -> dict[str, Tensor]:
         """
         __call__ applies the collator to a minibatch of data
         Args:
             batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
                 the first list is for the different samples, the second list is for the different elements with
-                elem["x"]: gene expression
+                elem["X"]: gene expression
                 elem["organism_name"]: organism ontology term id
                 elem["tp_name"]: heat diff
                 elem["class_names.."]: other classes
@@ -113,9 +133,9 @@ class Collator:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
                 continue
-            if "dataset" in elem:
-                dataset.append(elem["dataset"])
-            expr = np.array(elem["x"])
+            if "_storage_idx" in elem:
+                dataset.append(elem["_storage_idx"])
+            expr = np.array(elem["X"])
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
@@ -206,72 +226,17 @@ class Collator:
         }
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
+        if self.downsample is not None:
+            ret["x"] = downsample_profile(ret["x"], self.downsample)
+        if self.save_output:
+            with open("collator_output.txt", "a") as f:
+                np.savetxt(f, ret["x"].numpy())
         return ret
-class AnnDataCollator(Collator):
-    def __init__(self, *args, **kwargs):
-        """
-        AnnDataCollator Collator to use if working with AnnData's experimental dataloader (it is very slow!!!)
-        Args:
-            @see Collator
-        """
-        super().__init__(*args, **kwargs)
-    def __call__(self, batch):
-        exprs = []
-        total_count = []
-        other_classes = []
-        gene_locs = []
-        tp = []
-        for elem in batch:
-            organism_id = elem.obs[self.organism_name]
-            if organism_id.item() not in self.organism_ids:
-                print(organism_id)
-            expr = np.array(elem.X[0])
-            total_count.append(expr.sum())
-            if len(self.accepted_genes) > 0:
-                expr = expr[self.accepted_genes[organism_id]]
-            if self.how == "most expr":
-                loc = np.argsort(expr)[-(self.max_len) :][::-1]
-            elif self.how == "random expr":
-                nnz_loc = np.where(expr > 0)[0]
-                loc = nnz_loc[
-                    np.random.choice(len(nnz_loc), self.max_len, replace=False)
-                ]
-            else:
-                raise ValueError("how must be either most expr or random expr")
-            if self.add_zero_genes > 0:
-                zero_loc = np.where(expr == 0)[0]
-                zero_loc = [
-                    np.random.choice(len(zero_loc), self.add_zero_genes, replace=False)
-                ]
-                loc = np.concatenate((loc, zero_loc), axis=None)
-            exprs.append(expr[loc])
-            gene_locs.append(loc + self.start_idx[organism_id.item()])
-            if self.tp_name is not None:
-                tp.append(elem.obs[self.tp_name])
-            else:
-                tp.append(0)
-            other_classes.append([elem.obs[i].values[0] for i in self.class_names])
-        expr = np.array(exprs)
-        tp = np.array(tp)
-        gene_locs = np.array(gene_locs)
-        total_count = np.array(total_count)
-        other_classes = np.array(other_classes)
-        return {
-            "x": Tensor(expr),
-            "genes": Tensor(gene_locs).int(),
-            "depth": Tensor(total_count),
-            "class": Tensor(other_classes),
-        }
+#############
+#### WIP ####
+#############
 class GeneformerCollator(Collator):
     def __init__(self, *args, gene_norm_list: list, **kwargs):
         """

{scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/config.py RENAMED Viewed

@@ -1,3 +1,9 @@
+"""
+Configuration file for scDataLoader
+Missing labels are added to the dataset to complete a better hierarchical tree
+"""
 LABELS_TOADD = {
     "assay_ontology_term_id": {
         "10x transcription profiling": "EFO:0030003",

scdataloader 0.0.4__tar.gz → 1.0.5__tar.gz

scdataloader 0.0.4tar.gz → 1.0.5tar.gz