scdataloader 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +4 -0
- scdataloader/__main__.py +209 -0
- scdataloader/collator.py +307 -0
- scdataloader/config.py +106 -0
- scdataloader/data.py +181 -218
- scdataloader/datamodule.py +375 -0
- scdataloader/mapped.py +46 -32
- scdataloader/preprocess.py +524 -208
- scdataloader/utils.py +189 -123
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/METADATA +77 -7
- scdataloader-0.0.4.dist-info/RECORD +16 -0
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/WHEEL +1 -1
- scdataloader-0.0.2.dist-info/RECORD +0 -12
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/LICENSE +0 -0
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/entry_points.txt +0 -0
scdataloader/utils.py
CHANGED
```diff
@@ -10,9 +10,15 @@ from biomart import BiomartServer
 from django.db import IntegrityError
 from scipy.sparse import csr_matrix
 from scipy.stats import median_abs_deviation
+from functools import lru_cache
+from collections import Counter
 
+from typing import Union, List, Optional
 
-def createFoldersFor(filepath):
+from anndata import AnnData
+
+
+def createFoldersFor(filepath: str):
     """
     will recursively create folders if needed until having all the folders required to save the file in this filepath
     """
```
```diff
@@ -23,15 +29,25 @@ def createFoldersFor(filepath)
            os.mkdir(prevval)
 
 
-def _fetchFromServer(ensemble_server, attributes):
+def _fetchFromServer(
+    ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+):
+    """
+    Fetches data from the specified ensemble server.
+
+    Args:
+        ensemble_server (str): The URL of the ensemble server to fetch data from.
+        attributes (list): The list of attributes to fetch from the server.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing the fetched data.
+    """
     server = BiomartServer(ensemble_server)
-    ensmbl = server.datasets["hsapiens_gene_ensembl"]
+    ensmbl = server.datasets[database]
     print(attributes)
     res = pd.read_csv(
         io.StringIO(
-            ensmbl.search(
-                {"attributes": attributes}, header=1
-            ).content.decode()
+            ensmbl.search({"attributes": attributes}, header=1).content.decode()
         ),
         sep="\t",
     )
```
```diff
@@ -39,11 +55,12 @@ def _fetchFromServer(ensemble_server, attributes)
 
 
 def getBiomartTable(
-    ensemble_server="http://jul2023.archive.ensembl.org/biomart",
-    useCache=False,
-    cache_folder="/tmp/biomart/",
-    attributes=[],
-    bypass_attributes=False,
+    ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+    useCache: bool = False,
+    cache_folder: str = "/tmp/biomart/",
+    attributes: List[str] = [],
+    bypass_attributes: bool = False,
+    database: str = "hsapiens_gene_ensembl",
 ):
     """generate a genelist dataframe from ensembl's biomart
 
```
```diff
@@ -79,7 +96,7 @@ def getBiomartTable(
     else:
         print("downloading gene names from biomart")
 
-        res = _fetchFromServer(ensemble_server, attr + attributes)
+        res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
     res.to_csv(cachefile, index=False)
 
     res.columns = attr + attributes
```
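The new `database` argument threads from `getBiomartTable` down into `_fetchFromServer`, so the previously hardcoded `hsapiens_gene_ensembl` dataset can be swapped out. A minimal sketch of what that enables (the mouse dataset name follows standard Biomart naming and is not taken from this diff):

```python
from scdataloader.utils import getBiomartTable

# default behaviour is unchanged: human genes from the pinned jul2023 Ensembl archive
human = getBiomartTable(useCache=True)

# new in 0.0.4: query another Biomart dataset with the same helper,
# e.g. mouse (dataset name per Biomart convention, not from this diff)
mouse = getBiomartTable(database="mmusculus_gene_ensembl", useCache=True)
```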
```diff
@@ -93,7 +110,7 @@ def getBiomartTable(
     return res
 
 
-def validate(adata, lb, organism):
+def validate(adata: AnnData, organism: str):
     """
     validate checks if the adata object is valid for lamindb
 
```
```diff
@@ -116,8 +133,7 @@ def validate(adata, lb, organism)
     Returns:
         bool: True if the adata object is valid
     """
-    organism = lb.Organism.filter(ontology_id=organism).one().name
-    lb.settings.organism = organism
+    organism = bt.Organism.filter(ontology_id=organism).one().name
 
     if adata.var.index.duplicated().any():
         raise ValueError("Duplicate gene names found in adata.var.index")
```
```diff
@@ -136,70 +152,61 @@ def validate(adata, lb, organism)
         raise ValueError(
             f"Column '{val}' is missing in the provided anndata object."
         )
-    bionty_source = lb.BiontySource.filter(
-        entity="DevelopmentalStage", organism=organism
-    ).one()
 
-    if not lb.Ethnicity.validate(
+    if not bt.Ethnicity.validate(
         adata.obs["self_reported_ethnicity_ontology_term_id"],
         field="ontology_id",
     ).all():
         raise ValueError("Invalid ethnicity ontology term id found")
-    if not lb.Organism.validate(
+    if not bt.Organism.validate(
         adata.obs["organism_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid organism ontology term id found")
-    if not lb.Phenotype.validate(
+    if not bt.Phenotype.validate(
         adata.obs["sex_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid sex ontology term id found")
-    if not lb.Disease.validate(
+    if not bt.Disease.validate(
         adata.obs["disease_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid disease ontology term id found")
-    if not lb.CellType.validate(
+    if not bt.CellType.validate(
         adata.obs["cell_type_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid cell type ontology term id found")
-    if (
-        not lb.DevelopmentalStage.validate(
-            adata.obs["development_stage_ontology_term_id"],
-            bionty_source=bionty_source,
-            field="ontology_id",
-        )
-        .all()
-    ):
+    if not bt.DevelopmentalStage.validate(
+        adata.obs["development_stage_ontology_term_id"],
+        field="ontology_id",
+    ).all():
         raise ValueError("Invalid dev stage ontology term id found")
-    if not lb.Tissue.validate(
+    if not bt.Tissue.validate(
         adata.obs["tissue_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid tissue ontology term id found")
-    if not lb.ExperimentalFactor.validate(
+    if not bt.ExperimentalFactor.validate(
         adata.obs["assay_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid assay ontology term id found")
-    if (
-        not lb.Gene.validate(
-            adata.var.index, field="ensembl_gene_id", organism=organism
-        ).all()
-    ):
+    if not bt.Gene.validate(
+        adata.var.index, field="ensembl_gene_id", organism=organism
+    ).all():
         raise ValueError("Invalid gene ensembl id found")
     return True
 
 
-def get_all_ancestors(val, df):
+# setting a cache of 200 elements
+# @lru_cache(maxsize=200)
+def get_all_ancestors(val: str, df: pd.DataFrame):
     if val not in df.index:
         return set()
     parents = df.loc[val].parents__ontology_id
     if parents is None or len(parents) == 0:
         return set()
     else:
-        return set.union(
-            set(parents), *[get_all_ancestors(val, df) for val in parents]
-        )
+        return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
 
 
-def get_ancestry_mapping(all_elem, onto_df):
+def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
     """
     This function generates a mapping of all elements to their ancestors in the ontology dataframe.
 
```
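`validate` no longer takes an `lb` (lnschema-bionty) handle: the registries are reached through `bt` (bionty) internally, and the organism is passed as an ontology id that the function resolves to a name itself. A minimal usage sketch (the h5ad path is hypothetical):

```python
import anndata as ad
from scdataloader.utils import validate

adata = ad.read_h5ad("my_dataset.h5ad")  # hypothetical file

# raises ValueError on the first invalid ontology term or duplicated gene id,
# returns True otherwise
validate(adata, organism="NCBITaxon:9606")
```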
```diff
@@ -234,13 +241,12 @@ def get_ancestry_mapping(all_elem, onto_df)
 
 
 def load_dataset_local(
-    remote_dataset,
-    download_folder,
-    name,
-    description,
-    use_cache=True,
-    only=None,
+    remote_dataset: ln.Collection,
+    download_folder: str,
+    name: str,
+    description: str,
+    use_cache: bool = True,
+    only: Optional[List[int]] = None,
 ):
     """
     This function loads a remote lamindb dataset to local.
```
```diff
@@ -258,9 +264,7 @@ def load_dataset_local(
         lamindb.Dataset: The local dataset.
     """
     saved_files = []
-    default_storage = ln.Storage.filter(
-        root=ln.settings.storage.as_posix()
-    ).one()
+    default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one()
     files = (
         remote_dataset.artifacts.all()
         if not only
```
```diff
@@ -275,17 +279,15 @@ def load_dataset_local(
         if len(organism) == 0:
             print("No organism detected")
             continue
-        organism = lb.Organism.filter(ontology_id=organism[0]).one().name
-        # lb.settings.organism = organism
+        organism = bt.Organism.filter(ontology_id=organism[0]).one().name
+        # bt.settings.organism = organism
         path = file.path
         try:
             file.save()
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         # if location already has a file, don't save again
-        if use_cache and os.path.exists(
-            os.path.expanduser(download_folder + file.key)
-        ):
+        if use_cache and os.path.exists(os.path.expanduser(download_folder + file.key)):
             print(f"File {file.key} already exists in storage")
         else:
             path.download_to(download_folder + file.key)
```
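With the new type hints, `load_dataset_local` expects a lamindb `Collection` plus download metadata, and `only` restricts which artifacts get pulled. A sketch under those signatures (the collection filter mirrors the README's cellxgene example; the target folder and names are illustrative):

```python
import lamindb as ln
from scdataloader.utils import load_dataset_local

# illustrative remote collection, following the README's cellxgene example
remote = ln.Collection.using(instance="laminlabs/cellxgene").filter(
    name="cellxgene-census"
).first()

local = load_dataset_local(
    remote_dataset=remote,
    download_folder="~/scdataloader/",  # hypothetical target folder
    name="census-local",
    description="local copy for training",
    use_cache=True,
    only=[0, 1],  # Optional[List[int]]: only download the first two artifacts
)
```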
```diff
@@ -295,32 +297,53 @@ def load_dataset_local(
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         saved_files.append(file)
-    dataset = ln.Dataset(saved_files, name=name, description=description)
+    dataset = ln.Collection(saved_files, name=name, description=description)
     dataset.save()
     return dataset
 
 
+def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
+    organismdf = []
+    if type(organisms) == str:
+        organisms = [organisms]
+    for organism in organisms:
+        genesdf = bt.Gene.filter(
+            organism_id=bt.Organism.filter(ontology_id=organism).first().id
+        ).df()
+        genesdf = genesdf[~genesdf["public_source_id"].isna()]
+        genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
+        genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
+        # mitochondrial genes
+        genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
+        # ribosomal genes
+        genesdf["ribo"] = genesdf.symbol.astype(str).str.startswith(("RPS", "RPL"))
+        # hemoglobin genes.
+        genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
+        genesdf["organism"] = organism
+        organismdf.append(genesdf)
+    return pd.concat(organismdf)
+
+
 def populate_my_ontology(
-    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
-    sex=["PATO:0000384", "PATO:0000383"],
-    celltypes=[],
-    ethnicities=[],
-    assays=[],
-    tissues=[],
-    diseases=[],
-    dev_stages=[],
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes: List[str] = [],
+    ethnicities: List[str] = [],
+    assays: List[str] = [],
+    tissues: List[str] = [],
+    diseases: List[str] = [],
+    dev_stages: List[str] = [],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
 
     run this function just one for each new lamin storage
 
-    erase everything with lb.$ontology.filter().delete()
+    erase everything with bt.$ontology.filter().delete()
 
     add whatever value you need afterward like it is done here with:
 
-    `lb.$ontology(name="ddd", ontology_id="ddddd").save()`
+    `bt.$ontology(name="ddd", ontology_id="ddddd").save()`
 
     `df["assay_ontology_term_id"].unique()`
 
```
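`load_genes` is new in 0.0.4: it reads the registered genes for each organism from the bionty `Gene` registry, dedupes them by `ensembl_gene_id`, and flags mitochondrial, ribosomal, and hemoglobin genes. A short sketch (assumes the registry was already populated, e.g. by `populate_my_ontology` below):

```python
from scdataloader.utils import load_genes

# one dataframe indexed by ensembl_gene_id, with mt/ribo/hb boolean columns
genesdf = load_genes(organisms=["NCBITaxon:9606", "NCBITaxon:10090"])

# e.g. drop mitochondrial genes before building a gene vocabulary
genesdf = genesdf[~genesdf["mt"]]
```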
```diff
@@ -336,78 +359,88 @@ def populate_my_ontology(
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
 
-    names = bt.CellType().df().index if not celltypes else celltypes
-    records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)
-    ln.save(records)
-    lb.CellType(name="unknown", ontology_id="unknown").save()
+    names = bt.CellType.public().df().index if not celltypes else celltypes
+    records = bt.CellType.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(celltypes))
+    bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    names = bt.Organism().df().index if not organisms else organisms
-    records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)
-    ln.save(records)
-    lb.Organism(name="unknown", ontology_id="unknown").save()
+    names = bt.Organism.public().df().index if not organisms else organisms
+    records = [
+        i[0] if type(i) is list else i
+        for i in [bt.Organism.from_public(ontology_id=i) for i in names]
+    ]
+    ln.save(records, parents=bool(organisms))
+    bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
-    names = bt.Phenotype().df().index if not sex else sex
-    records = lb.Phenotype.from_values(
-        names,
-        field=lb.Phenotype.ontology_id,
-        bionty_source=lb.BiontySource.filter(
-            entity="Phenotype", source="pato"
-        ).one(),
-    )
-    ln.save(records)
-    lb.Phenotype(name="unknown", ontology_id="unknown").save()
+    names = bt.Phenotype.public().df().index if not sex else sex
+    records = [
+        bt.Phenotype.from_public(
+            ontology_id=i,
+            public_source=bt.PublicSource.filter(
+                entity="Phenotype", source="pato"
+            ).one(),
+        )
+        for i in names
+    ]
+    ln.save(records, parents=bool(sex))
+    bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    names = bt.Ethnicity().df().index if not ethnicities else ethnicities
-    records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)
-    ln.save(records)
-    lb.Ethnicity(
+    names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+    records = bt.Ethnicity.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(ethnicities))
+    bt.Ethnicity(
         name="unknown", ontology_id="unknown"
     ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    names = bt.ExperimentalFactor().df().index if not assays else assays
-    records = lb.ExperimentalFactor.from_values(
-        names, field=lb.ExperimentalFactor.ontology_id
-    )
-    ln.save(records)
-    lb.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
-    # lookup = lb.ExperimentalFactor.lookup()
+    names = bt.ExperimentalFactor.public().df().index if not assays else assays
+    records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(assays))
+    bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
+    # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    names = bt.Tissue().df().index if not tissues else tissues
-    records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)
-    ln.save(records)
-    lb.Tissue(name="unknown", ontology_id="unknown").save()
+    names = bt.Tissue.public().df().index if not tissues else tissues
+    records = bt.Tissue.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(tissues))
+    bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
     names = (
-        bt.DevelopmentalStage().df().index if not dev_stages else dev_stages
-    )
-    records = lb.DevelopmentalStage.from_values(
-        names, field=lb.DevelopmentalStage.ontology_id
+        bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
     )
+    records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(dev_stages))
+    bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+    names = bt.DevelopmentalStage.public(organism="mouse").df().name
+    bionty_source = bt.PublicSource.filter(
+        entity="DevelopmentalStage", organism="mouse"
+    ).one()
+    records = [
+        bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+        for i in names.tolist()
+    ]
+    records[-4] = records[-4][0]
     ln.save(records)
-    lb.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
     # Disease
-    names = bt.Disease().df().index if not diseases else diseases
-    records = lb.Disease.from_values(names, field=lb.Disease.ontology_id)
-    ln.save(records)
-    lb.Disease(name="normal", ontology_id="PATO:0000461").save()
-    lb.Disease(name="unknown", ontology_id="unknown").save()
+    names = bt.Disease.public().df().index if not diseases else diseases
+    records = bt.Disease.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(diseases))
+    bt.Disease(name="normal", ontology_id="PATO:0000461").save()
+    bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in
+    for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
         # convert onto to name
-        organism = lb.Organism.filter(ontology_id=organism).one().name
-        names = bt.Gene(organism=organism).df()["ensembl_gene_id"]
-        records = lb.Gene.from_values(
+        organism = bt.Organism.filter(ontology_id=organism).one().name
+        names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
+        records = bt.Gene.from_values(
             names,
             field="ensembl_gene_id",
-            bionty_source=lb.BiontySource.filter(
-                entity="Gene", organism=organism
-            ).first(),
+            organism=organism,
         )
         ln.save(records)
 
 
-def is_outlier(adata, metric: str, nmads: int):
+def is_outlier(adata: AnnData, metric: str, nmads: int):
     """
     is_outlier detects outliers in adata.obs[metric]
 
```
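Per its docstring, `populate_my_ontology` is a one-off bootstrap for a fresh lamin instance: passing explicit id lists restricts what gets registered, while empty lists pull the whole public ontology. A sketch of a minimal human-only setup (the restricted lists are illustrative):

```python
from scdataloader.utils import populate_my_ontology

# run once after `lamin init --storage ... --schema bionty`
populate_my_ontology(
    organisms=["NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    celltypes=[],  # empty -> register every cell type from the public ontology
)
```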
```diff
@@ -426,7 +459,7 @@ def is_outlier(adata, metric: str, nmads: int)
     return outlier
 
 
-def length_normalize(adata, gene_lengths):
+def length_normalize(adata: AnnData, gene_lengths: list):
     """
     length_normalize normalizes the counts by the gene length
 
```
```diff
@@ -441,7 +474,7 @@ def length_normalize(adata, gene_lengths)
     return adata
 
 
-def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
     """
     pd_load_cached downloads a file from a url and loads it as a pandas dataframe
 
```
```diff
@@ -459,3 +492,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs)
         urllib.request.urlretrieve(url, loc)
     # Load the data from the file
     return pd.read_csv(loc, **kwargs)
+
+
+def translate(
+    val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+):
+    """
+    translate translates the ontology term id to the name
+
+    Args:
+        val (str, dict, set, list, dict): the object to translate
+        t (flat, optional): the type of ontology terms.
+            one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+            Defaults to "cell_type_ontology_term_id".
+
+    Returns:
+        dict: the mapping for the translation
+    """
+    if t == "cell_type_ontology_term_id":
+        obj = bt.CellType.public(organism="all")
+    elif t == "assay_ontology_term_id":
+        obj = bt.ExperimentalFactor.public()
+    elif t == "tissue_ontology_term_id":
+        obj = bt.Tissue.public()
+    else:
+        return None
+    if type(val) is str:
+        return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+    elif type(val) is list or type(val) is set:
+        return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+    elif type(val) is dict or type(val) is Counter:
+        return {
+            obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+        }
```
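The new `translate` helper resolves ontology ids to names through the public bionty registries and accepts a string, a list/set, or a dict/Counter. A quick sketch (`CL:0000057`, fibroblast, is just an example id):

```python
from collections import Counter
from scdataloader.utils import translate

# single id -> {id: name}
translate("CL:0000057")

# Counter of cell-type ids -> {name: count}
translate(Counter({"CL:0000057": 12, "CL:0000236": 4}))
```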
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/METADATA
CHANGED

```diff
@@ -1,39 +1,45 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.2
+Version: 0.0.4
 Summary: a dataloader for single cell data in lamindb
-Home-page: https://github.com/jkobject/
+Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
 Keywords: scRNAseq,dataloader,pytorch,lamindb,scPrint
 Author: jkobject
-Requires-Python:
+Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: anndata
 Requires-Dist: biomart
+Requires-Dist: bionty
 Requires-Dist: cellxgene-census
 Requires-Dist: decoupler
 Requires-Dist: django
 Requires-Dist: ipykernel
 Requires-Dist: lamindb
 Requires-Dist: leidenalg
+Requires-Dist: lightning
+Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
 Requires-Dist: torchdata
-Project-URL: Repository, https://github.com/jkobject/
+Project-URL: Repository, https://github.com/jkobject/scDataLoader
 Description-Content-Type: text/markdown
 
 # scdataloader
 
 [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=GRnGTyNMZt)](https://codecov.io/gh/jkobject/scDataLoader)
 [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10573143.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
 
-
+
+Awesome single cell dataloader created by @jkobject
+
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
 
 This data loader is designed to be used with:
 
```
````diff
@@ -51,14 +57,78 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
 
+## About
+
+the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+
+1. loading from lamin
+2. doing some dataset specific preprocessing if needed
+3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
+4. passing it to a dataloader object that can work with it correctly
+
+Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)
+
+![scdataloader.drawio.png](docs/scdataloader.drawio.png)
+
 ## Install it from PyPI
 
 ```bash
 pip install scdataloader
 ```
 
+### Install it locally and run the notebooks:
+
+```bash
+git clone https://github.com/jkobject/scDataLoader.git
+cd scDataLoader
+poetry install
+```
+then run the notebooks with the poetry installed environment
+
 ## Usage
 
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
+
+
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"], #organism that we will work on
+    how="most expr", # for the collator (most expr genes only will be selected)
+    max_len=1000, # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+
+# with lightning:
+# Trainer(model, datamodule)
+
+```
+
 see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
 
 1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
````
scdataloader-0.0.4.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=ln2a-xATRmZxZvLnboGRC8GQSI19QdUMoAcunZLwDjI,6
+scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+scdataloader/__main__.py,sha256=UyXtFHgWxE-ecJmM_oEDLlzBDBbH-uEKAVj1A7BkwmM,6297
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=Ykjdw24GUvHdbowWUDtp28YTkaF3w65SiWTU2PKBzy4,11714
+scdataloader/config.py,sha256=0_LoIblgdZZ19yM2qvPE-padMGQzdhuaxX20zYrhWq0,2780
+scdataloader/data.py,sha256=faJWN--06N7irWBKcjeU6fcX5NbzyEPXs2_EVGxfBpw,12292
+scdataloader/datamodule.py,sha256=OhHPb3jhGG5HbvahzTGxgzJ_lxbVJ4PfZspVW9h7SZk,14789
+scdataloader/mapped.py,sha256=rhE11Xl3x_wIKu3m_wu8Is6mYsXdblu3nQpT5lNqr60,13301
+scdataloader/preprocess.py,sha256=67ewe6b4HIjz_vTDjlOAJ4lMe4K2oCw2HHHUS-7S77M,38205
+scdataloader/utils.py,sha256=6eKU3_cotEaQcxONMrCWzMx7U8DybabteNhk-vNqfUQ,19365
+scdataloader-0.0.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-0.0.4.dist-info/METADATA,sha256=Bf8UjMwRcqSbWW8VbWrLhSb7qKQYdjZtJ7d6Oz4-rn8,39733
+scdataloader-0.0.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-0.0.4.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-0.0.4.dist-info/RECORD,,
```
scdataloader-0.0.2.dist-info/RECORD
DELETED

```diff
@@ -1,12 +0,0 @@
-scdataloader/VERSION,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-scdataloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
-scdataloader/data.py,sha256=5-w4WL0Ho5RW47J37N-zdNhV4Fjs0b7lb6c6ugeTMi4,12793
-scdataloader/mapped.py,sha256=wQN2K7GnJv-UiNIlC41HItrVMW50tECAjc8mt-QV-1I,12290
-scdataloader/preprocess.py,sha256=sm5OPREZFJaGVF9VsTKGvT1jHT7sOouX_ql0mWx3_4Q,23103
-scdataloader/utils.py,sha256=Ih1LLnmRZYOpIk1IoAJKyRAT361zrgBgUhwJM04V6Pw,16115
-scdataloader-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-scdataloader-0.0.2.dist-info/METADATA,sha256=4ICXsQcdWkwrAZZVDIYG1L3d7JCpaxpr3MYlnVsD1Qw,37340
-scdataloader-0.0.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-scdataloader-0.0.2.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
-scdataloader-0.0.2.dist-info/RECORD,,
```
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/LICENSE
File without changes

{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/entry_points.txt
File without changes