scdataloader 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,45 @@
  Metadata-Version: 2.1
  Name: scdataloader
- Version: 0.0.2
+ Version: 0.0.4
  Summary: a dataloader for single cell data in lamindb
- Home-page: https://github.com/jkobject/scPrint
+ Home-page: https://github.com/jkobject/scDataLoader
  License: GPL3
  Keywords: scRNAseq,dataloader,pytorch,lamindb,scPrint
  Author: jkobject
- Requires-Python: >=3.10,<4.0
+ Requires-Python: ==3.10.*
  Classifier: License :: Other/Proprietary License
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
  Requires-Dist: anndata
  Requires-Dist: biomart
+ Requires-Dist: bionty
  Requires-Dist: cellxgene-census
  Requires-Dist: decoupler
  Requires-Dist: django
  Requires-Dist: ipykernel
  Requires-Dist: lamindb
  Requires-Dist: leidenalg
+ Requires-Dist: lightning
+ Requires-Dist: lnschema-bionty
  Requires-Dist: matplotlib
  Requires-Dist: pandas (>=2.0.0)
+ Requires-Dist: scikit-misc
  Requires-Dist: seaborn
  Requires-Dist: torch
  Requires-Dist: torchdata
- Project-URL: Repository, https://github.com/jkobject/scPrint
+ Project-URL: Repository, https://github.com/jkobject/scDataLoader
  Description-Content-Type: text/markdown
 
  # scdataloader
 
  [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
  [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![DOI](https://zenodo.org/badge/731248665.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
 
- Awesome single cell dataloader created by @jkobject
+
+ Awesome single-cell dataloader created by @jkobject
+
+ Built on top of `lamindb` and the `.mapped()` function by Sergey (https://github.com/Koncopd).
 
  This data loader is designed to be used with:
 
@@ -51,14 +57,78 @@ It allows you to:
  3. create a more complex single cell dataset
  4. extend it to your needs
 
+ ## About
+
+ The idea is to use it to train models like scGPT / Geneformer (and soon, scPrint ;)). It:
+
+ 1. loads data from lamin
+ 2. does some dataset-specific preprocessing if needed
+ 3. creates a dataset object on top of `.mapped()` (needed for mapping genes, cell labels, etc.)
+ 4. passes it to a dataloader object that can work with it correctly
+
+ Currently, one has to use the preprocess function to make a dataset fit different tools like scGPT / Geneformer. The goal is to enable this through different Collators instead; that part is still a work in progress (please do contribute!). A sketch of what such a collator could look like follows the diagram below.
+
+ ![scdataloader overview](docs/scdataloader.drawio.png)
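+
+ As an illustration only (this is not the package's API), here is a minimal sketch of what such a custom collator could look like, assuming each sample is a dict carrying an `"x"` expression vector; all names in it are hypothetical:
+
+ ```python
+ import torch
+
+ class MostExprCollator:
+     """Hypothetical sketch: keep the `max_len` most expressed genes in a batch."""
+
+     def __init__(self, max_len: int = 1000):
+         self.max_len = max_len
+
+     def __call__(self, batch):
+         # batch: a list of dicts, each with an "x" expression vector (assumed layout)
+         x = torch.stack([torch.as_tensor(el["x"], dtype=torch.float32) for el in batch])
+         # rank genes by total expression across the batch and keep the top max_len
+         top = x.sum(dim=0).topk(min(self.max_len, x.shape[1])).indices
+         return {"x": x[:, top], "genes": top}
+ ```
+
+ A collator like this would be passed to a PyTorch `DataLoader` via its `collate_fn` argument.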
+
  ## Install it from PyPI
 
  ```bash
  pip install scdataloader
  ```
 
+ ### Install it locally and run the notebooks
+
+ ```bash
+ git clone https://github.com/jkobject/scDataLoader.git
+ cd scDataLoader
+ poetry install
+ ```
+
+ Then run the notebooks with the Poetry-installed environment.
+
  ## Usage
 
+ ```python
+ # initialize a local lamin database first, e.g.:
+ # !lamin init --storage ~/scdataloader --schema bionty
+
+ import lamindb as ln
+
+ from scdataloader import utils
+ from scdataloader.preprocess import (
+     LaminPreprocessor,
+     additional_postprocess,
+     additional_preprocess,
+ )
+
+ # preprocess datasets
+ DESCRIPTION = "preprocessed by scDataLoader"
+
+ cx_dataset = (
+     ln.Collection.using(instance="laminlabs/cellxgene")
+     .filter(name="cellxgene-census", version="2023-12-15")
+     .one()
+ )
+ cx_dataset, len(cx_dataset.artifacts.all())  # notebook-style peek at the collection
+
+ do_preprocess = LaminPreprocessor(
+     additional_postprocess=additional_postprocess,
+     additional_preprocess=additional_preprocess,
+     skip_validate=True,
+     subset_hvg=0,
+ )
+
+ preprocessed_dataset = do_preprocess(
+     cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2"
+ )
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+     collection_name="preprocessed dataset",
+     organisms=["NCBITaxon:9606"],  # organisms to work on
+     how="most expr",  # the collator keeps only the most expressed genes
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+     test_split=0,
+ )
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+     print(i)  # or just `pass`
+     break
+
+ # with lightning, pass the datamodule to a Trainer (see the sketch below):
+ # Trainer().fit(model, datamodule=datamodule)
+ ```
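+
+ To go one step further than the comment above, a minimal (hypothetical) Lightning setup around the datamodule might look like this; `DemoModel`, its dimensions, and the `"x"`/`"class"` batch keys are illustrative assumptions, not part of scDataLoader:
+
+ ```python
+ import torch
+ import lightning as L
+
+ class DemoModel(L.LightningModule):
+     # toy classifier over the 1000 most expressed genes (assumed batch layout)
+     def __init__(self, n_genes: int = 1000, n_classes: int = 10):
+         super().__init__()
+         self.linear = torch.nn.Linear(n_genes, n_classes)
+
+     def training_step(self, batch, batch_idx):
+         # assumes the collator yields a dict with "x" expressions and "class" labels
+         return torch.nn.functional.cross_entropy(self.linear(batch["x"]), batch["class"])
+
+     def configure_optimizers(self):
+         return torch.optim.Adam(self.parameters(), lr=1e-3)
+
+ trainer = L.Trainer(max_epochs=1)
+ trainer.fit(DemoModel(), datamodule=datamodule)
+ ```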
+
  See the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
 
  1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
@@ -0,0 +1,107 @@
+ # scdataloader
+
+ [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
+ [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![DOI](https://zenodo.org/badge/731248665.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
+
+ Awesome single-cell dataloader created by @jkobject
+
+ Built on top of `lamindb` and the `.mapped()` function by Sergey (https://github.com/Koncopd).
+
+ This data loader is designed to be used with:
+
+ - [lamindb](https://lamin.ai/)
+
+ and:
+
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+
+ It allows you to:
+
+ 1. load thousands of datasets containing millions of cells in a few seconds.
+ 2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
+ 3. create a more complex single cell dataset
+ 4. extend it to your needs
+
+ ## About
+
+ The idea is to use it to train models like scGPT / Geneformer (and soon, scPrint ;)). It:
+
+ 1. loads data from lamin
+ 2. does some dataset-specific preprocessing if needed
+ 3. creates a dataset object on top of `.mapped()` (needed for mapping genes, cell labels, etc.)
+ 4. passes it to a dataloader object that can work with it correctly
+
+ Currently, one has to use the preprocess function to make a dataset fit different tools like scGPT / Geneformer. The goal is to enable this through different Collators instead; that part is still a work in progress (please do contribute!).
+
+ ![scdataloader overview](docs/scdataloader.drawio.png)
+
+ ## Install it from PyPI
+
+ ```bash
+ pip install scdataloader
+ ```
+
+ ### Install it locally and run the notebooks
+
+ ```bash
+ git clone https://github.com/jkobject/scDataLoader.git
+ cd scDataLoader
+ poetry install
+ ```
+
+ Then run the notebooks with the Poetry-installed environment.
+
+ ## Usage
+
+ ```python
+ # initialize a local lamin database first, e.g.:
+ # !lamin init --storage ~/scdataloader --schema bionty
+
+ import lamindb as ln
+
+ from scdataloader import utils
+ from scdataloader.preprocess import (
+     LaminPreprocessor,
+     additional_postprocess,
+     additional_preprocess,
+ )
+
+ # preprocess datasets
+ DESCRIPTION = "preprocessed by scDataLoader"
+
+ cx_dataset = (
+     ln.Collection.using(instance="laminlabs/cellxgene")
+     .filter(name="cellxgene-census", version="2023-12-15")
+     .one()
+ )
+ cx_dataset, len(cx_dataset.artifacts.all())  # notebook-style peek at the collection
+
+ do_preprocess = LaminPreprocessor(
+     additional_postprocess=additional_postprocess,
+     additional_preprocess=additional_preprocess,
+     skip_validate=True,
+     subset_hvg=0,
+ )
+
+ preprocessed_dataset = do_preprocess(
+     cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2"
+ )
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+     collection_name="preprocessed dataset",
+     organisms=["NCBITaxon:9606"],  # organisms to work on
+     how="most expr",  # the collator keeps only the most expressed genes
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+     test_split=0,
+ )
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+     print(i)  # or just `pass`
+     break
+
+ # with lightning:
+ # Trainer().fit(model, datamodule=datamodule)
+ ```
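+
+ ### Command line
+
+ The package also ships a small CLI for preprocessing a lamindb collection (see `__main__.py`); the example below reproduces the invocation comment from that file and assumes a `scdataloader` console-script entry point:
+
+ ```bash
+ scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" \
+   --version="2023-12-15" --description="preprocessed for scprint" \
+   --new_name="scprint main" --start_at=39
+ ```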
+
+ See the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
+
+ 1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
+ 2. [create a dataset](https://jkobject.github.io/scDataLoader/notebooks/02_create_dataset.html)
+
+ ## Development
+
+ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
@@ -1,24 +1,19 @@
  [tool.poetry]
  name = "scdataloader"
- version = "0.0.2"
+ version = "0.0.4"
  description = "a dataloader for single cell data in lamindb"
  authors = ["jkobject"]
  license = "GPL3"
  readme = ["README.md", "LICENSE"]
- repository = "https://github.com/jkobject/scPrint"
- keywords = [
-     "scRNAseq",
-     "dataloader",
-     "pytorch",
-     "lamindb",
-     "scPrint",
- ]
+ repository = "https://github.com/jkobject/scDataLoader"
+ keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPrint"]
 
  [tool.poetry.dependencies]
- python = "^3.10"
+ python = "3.10.*"
  lamindb = "*"
  cellxgene-census = "*"
  torch = "*"
+ lightning = "*"
  anndata = "*"
  matplotlib = "*"
  seaborn = "*"
@@ -29,6 +24,9 @@ pandas = ">=2.0.0"
  leidenalg = "*"
  decoupler = "*"
  django = "*"
+ lnschema-bionty = "*"
+ bionty = "*"
+ scikit-misc = "*"
 
  [tool.poetry.group.dev.dependencies]
  pytest = "^7.4.3"
@@ -46,6 +44,7 @@ mkdocs-git-authors-plugin = "*"
  mkdocs-jupyter = "*"
  mkdocstrings-python = "*"
 
+
  [build-system]
  requires = ["poetry-core"]
  build-backend = "poetry.core.masonry.api"
@@ -0,0 +1 @@
+ 0.7.0
@@ -0,0 +1,4 @@
+ from .data import Dataset
+ from .datamodule import DataModule
+ from .preprocess import Preprocessor
+ from .collator import *
@@ -0,0 +1,209 @@
+ import argparse
+ from scdataloader.preprocess import (
+     LaminPreprocessor,
+     additional_preprocess,
+     additional_postprocess,
+ )
+ import lamindb as ln
+
+
+ # example invocation:
+ # scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Preprocess datasets in a given lamindb collection."
+     )
+     parser.add_argument(
+         "--name", type=str, required=True, help="Name of the input dataset."
+     )
+     parser.add_argument(
+         "--new_name",
+         type=str,
+         default="preprocessed dataset",
+         help="Name of the preprocessed dataset.",
+     )
+     parser.add_argument(
+         "--description",
+         type=str,
+         default="preprocessed by scDataLoader",
+         help="Description of the preprocessed dataset.",
+     )
+     parser.add_argument(
+         "--start_at", type=int, default=0, help="Position to start preprocessing at."
+     )
+     parser.add_argument(
+         "--new_version",
+         type=str,
+         default="2",
+         help="Version of the output dataset and files.",
+     )
+     parser.add_argument(
+         "--instance",
+         type=str,
+         default=None,
+         help="Instance storing the input dataset, if not local.",
+     )
+     parser.add_argument(
+         "--version", type=str, default=None, help="Version of the input dataset."
+     )
+     # argparse needs a callable `type`, so a Union[int, bool] annotation cannot be
+     # used here; 0 keeps the falsy "disabled" default and a positive int sets the
+     # filtering threshold.
+     parser.add_argument(
+         "--filter_gene_by_counts",
+         type=int,
+         default=0,
+         help="Minimum counts to keep a gene; 0 disables gene filtering.",
+     )
+     parser.add_argument(
+         "--filter_cell_by_counts",
+         type=int,
+         default=0,
+         help="Minimum counts to keep a cell; 0 disables cell filtering.",
+     )
+     parser.add_argument(
+         "--normalize_sum",
+         type=float,
+         default=1e4,
+         help="Value to normalize the total counts of each cell to.",
+     )
+     parser.add_argument(
+         "--subset_hvg",
+         type=int,
+         default=0,
+         help="Number of highly variable genes to subset to; 0 disables subsetting.",
+     )
+     parser.add_argument(
+         "--hvg_flavor",
+         type=str,
+         default="seurat_v3",
+         help="Flavor of the highly variable gene selection.",
+     )
+     parser.add_argument(
+         "--binning",
+         type=int,
+         default=None,
+         help="Number of bins to discretize the data into; omit to disable binning.",
+     )
+     parser.add_argument(
+         "--result_binned_key",
+         type=str,
+         default="X_binned",
+         help="Key of AnnData to store the binned data in.",
+     )
+     # `type=bool` is a trap with argparse (bool("False") is True), so the boolean
+     # options are plain flags that default to False.
+     parser.add_argument(
+         "--length_normalize",
+         action="store_true",
+         help="Length-normalize the data.",
+     )
+     parser.add_argument(
+         "--force_preprocess",
+         action="store_true",
+         help="Force preprocessing.",
+     )
+     parser.add_argument(
+         "--min_dataset_size",
+         type=int,
+         default=100,
+         help="Specifies the minimum dataset size.",
+     )
+     parser.add_argument(
+         "--min_valid_genes_id",
+         type=int,
+         default=10_000,
+         help="Specifies the minimum number of valid gene ids.",
+     )
+     parser.add_argument(
+         "--min_nnz_genes",
+         type=int,
+         default=400,
+         help="Specifies the minimum number of nonzero genes.",
+     )
+     parser.add_argument(
+         "--maxdropamount",
+         type=int,
+         default=50,
+         help="Specifies the maximum drop amount.",
+     )
+     parser.add_argument(
+         "--madoutlier", type=int, default=5, help="Specifies the MAD outlier threshold."
+     )
+     parser.add_argument(
+         "--pct_mt_outlier",
+         type=int,
+         default=8,
+         help="Specifies the mitochondrial-percentage outlier threshold.",
+     )
+     parser.add_argument(
+         "--batch_key", type=str, default=None, help="Specifies the batch key."
+     )
+     parser.add_argument(
+         "--skip_validate",
+         action="store_true",
+         help="Skip validation.",
+     )
+     parser.add_argument(
+         "--do_postp",
+         action="store_true",
+         help="Run postprocessing.",
+     )
+     args = parser.parse_args()
+
+     # Load the collection
+     if args.instance is not None:
+         collection = (
+             ln.Collection.using(instance=args.instance)
+             .filter(name=args.name, version=args.version)
+             .first()
+         )
+     else:
+         collection = ln.Collection.filter(name=args.name, version=args.version).first()
+
+     print(f"using the dataset {collection} of size {len(collection.artifacts.all())}")
+     # Initialize the preprocessor
+     preprocessor = LaminPreprocessor(
+         filter_gene_by_counts=args.filter_gene_by_counts,
+         filter_cell_by_counts=args.filter_cell_by_counts,
+         normalize_sum=args.normalize_sum,
+         subset_hvg=args.subset_hvg,
+         hvg_flavor=args.hvg_flavor,
+         binning=args.binning,
+         result_binned_key=args.result_binned_key,
+         length_normalize=args.length_normalize,
+         force_preprocess=args.force_preprocess,
+         min_dataset_size=args.min_dataset_size,
+         min_valid_genes_id=args.min_valid_genes_id,
+         min_nnz_genes=args.min_nnz_genes,
+         maxdropamount=args.maxdropamount,
+         madoutlier=args.madoutlier,
+         pct_mt_outlier=args.pct_mt_outlier,
+         batch_key=args.batch_key,
+         skip_validate=args.skip_validate,
+         do_postp=args.do_postp,
+         additional_preprocess=additional_preprocess,
+         additional_postprocess=additional_postprocess,
+         keep_files=False,
+     )
+
+     # Preprocess the dataset
+     preprocessor(
+         collection,
+         name=args.new_name,
+         description=args.description,
+         start_at=args.start_at,
+         version=args.new_version,
+     )
+
+     print(
+         f"Preprocessed dataset saved with version {args.new_version} and name {args.new_name}."
+     )
+
+
+ if __name__ == "__main__":
+     main()