scdataloader 1.1.3__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/utils.py CHANGED
@@ -1,23 +1,21 @@
  import io
  import os
  import urllib
+ from collections import Counter
+ from functools import lru_cache
+ from typing import List, Optional, Union

  import bionty as bt
  import lamindb as ln
  import numpy as np
  import pandas as pd
+ import torch
+ from anndata import AnnData
  from biomart import BiomartServer
  from django.db import IntegrityError
  from scipy.sparse import csr_matrix
  from scipy.stats import median_abs_deviation
- from functools import lru_cache
- from collections import Counter
  from torch import Tensor
- import torch
-
- from typing import Union, List, Optional
-
- from anndata import AnnData


  def downsample_profile(mat: Tensor, dropout: float):
@@ -92,7 +90,7 @@ def _fetchFromServer(


  def getBiomartTable(
-     ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+     ensemble_server: str = "http://may2024.archive.ensembl.org/biomart",
      useCache: bool = False,
      cache_folder: str = "/tmp/biomart/",
      attributes: List[str] = [],
@@ -102,7 +100,7 @@ def getBiomartTable(
      """generate a genelist dataframe from ensembl's biomart

      Args:
-         ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
+         ensemble_server (str, optional): the biomart server. Defaults to "http://may2024.archive.ensembl.org/biomart".
          useCache (bool, optional): whether to use the cache or not. Defaults to False.
          cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
          attributes (List[str], optional): the attributes to fetch. Defaults to [].
@@ -129,21 +127,20 @@ def getBiomartTable(

      cache_folder = os.path.expanduser(cache_folder)
      createFoldersFor(cache_folder)
-     cachefile = os.path.join(cache_folder, ".biomart.csv")
+     cachefile = os.path.join(cache_folder, ".biomart.parquet")
      if useCache & os.path.isfile(cachefile):
          print("fetching gene names from biomart cache")
-         res = pd.read_csv(cachefile)
+         res = pd.read_parquet(cachefile)
      else:
          print("downloading gene names from biomart")

          res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
-         res.to_csv(cachefile, index=False)
+         res.to_parquet(cachefile, index=False)
      res.columns = attr + attributes
      if type(res) is not type(pd.DataFrame()):
          raise ValueError("should be a dataframe")
      res = res[~(res["ensembl_gene_id"].isna())]
      if "hgnc_symbol" in res.columns:
-         res = res[res["hgnc_symbol"].isna()]
          res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
              res.hgnc_symbol.isna()
          ]["ensembl_gene_id"]
@@ -371,10 +368,16 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10
          genesdf["organism"] = organism
          organismdf.append(genesdf)
      organismdf = pd.concat(organismdf)
-     organismdf.drop(
-         columns=["source_id", "run_id", "created_by_id", "updated_at", "stable_id"],
-         inplace=True,
-     )
+     for col in [
+         "source_id",
+         "run_id",
+         "created_by_id",
+         "updated_at",
+         "stable_id",
+         "created_at",
+     ]:
+         if col in organismdf.columns:
+             organismdf.drop(columns=[col], inplace=True)
      return organismdf


@@ -387,6 +390,7 @@ def populate_my_ontology(
      tissues: List[str] = [],
      diseases: List[str] = [],
      dev_stages: List[str] = [],
+     organism_clade: str = "vertebrates",
  ):
      """
      creates a local version of the lamin ontologies and adds the required missing values to the base ontologies
@@ -397,7 +401,7 @@ def populate_my_ontology(

      add whatever value you need afterward like it is done here with:

      `bt.$ontology(name="ddd", ontology_id="ddddd").save()`

      `df["assay_ontology_term_id"].unique()`

@@ -414,89 +418,111 @@ def populate_my_ontology(
      """
      # cell type
      if celltypes is not None:
-         names = bt.CellType.public().df().index if not celltypes else celltypes
-         records = bt.CellType.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(celltypes) == 0:
+             bt.CellType.import_from_source(update=True)
+         else:
+             names = bt.CellType.public().df().index if not celltypes else celltypes
+             records = bt.CellType.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.CellType(name="unknown", ontology_id="unknown").save()
      # Organism
      if organisms is not None:
-         names = bt.Organism.public().df().index if not organisms else organisms
+         names = (
+             bt.Organism.public(organism=organism_clade).df().index
+             if not organisms
+             else organisms
+         )
+         source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last()
          records = [
              i[0] if type(i) is list else i
-             for i in [bt.Organism.from_source(ontology_id=i) for i in names]
+             for i in [
+                 bt.Organism.from_source(ontology_id=i, source=source) for i in names
+             ]
          ]
          ln.save(records)
          bt.Organism(name="unknown", ontology_id="unknown").save()
-         organism_names = names
      # Phenotype
      if sex is not None:
          names = bt.Phenotype.public().df().index if not sex else sex
+         source = bt.PublicSource.filter(name="pato").first()
          records = [
-             bt.Phenotype.from_source(
-                 ontology_id=i, source=bt.PublicSource.filter(name="pato").first()
-             )
-             for i in names
+             bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
          ]
          ln.save(records)
          bt.Phenotype(name="unknown", ontology_id="unknown").save()
      # ethnicity
      if ethnicities is not None:
-         names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
-         records = bt.Ethnicity.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(ethnicities) == 0:
+             bt.Ethnicity.import_from_source(update=True)
+         else:
+             names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+             records = bt.Ethnicity.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.Ethnicity(
              name="unknown", ontology_id="unknown"
          ).save()  # multi ethnic will have to get renamed
      # ExperimentalFactor
      if assays is not None:
-         names = bt.ExperimentalFactor.public().df().index if not assays else assays
-         records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(assays) == 0:
+             bt.ExperimentalFactor.import_from_source(update=True)
+         else:
+             names = bt.ExperimentalFactor.public().df().index if not assays else assays
+             records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
          # lookup = bt.ExperimentalFactor.lookup()
          # lookup.smart_seq_v4.parents.add(lookup.smart_like)
      # Tissue
      if tissues is not None:
-         names = bt.Tissue.public().df().index if not tissues else tissues
-         records = bt.Tissue.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(tissues) == 0:
+             bt.Tissue.import_from_source(update=True)
+         else:
+             names = bt.Tissue.public().df().index if not tissues else tissues
+             records = bt.Tissue.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.Tissue(name="unknown", ontology_id="unknown").save()
      # DevelopmentalStage
      if dev_stages is not None:
-         names = (
-             bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
-         )
-         records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(dev_stages) == 0:
+             bt.DevelopmentalStage.import_from_source(update=True)
+             source = bt.PublicSource.filter(organism="mouse", name="mmusdv").last()
+             bt.DevelopmentalStage.import_from_source(source=source)
+         else:
+             names = (
+                 bt.DevelopmentalStage.public().df().index
+                 if not dev_stages
+                 else dev_stages
+             )
+             records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()

-         names = bt.DevelopmentalStage.public(organism="mouse").df().index
-         records = [
-             bt.DevelopmentalStage.from_source(
-                 ontology_id=i,
-                 source=bt.PublicSource.filter(organism="mouse", name="mmusdv").first(),
-             )
-             for i in names.tolist()
-         ]
-         ln.save(records)
      # Disease
      if diseases is not None:
-         names = bt.Disease.public().df().index if not diseases else diseases
-         records = bt.Disease.from_values(names, field="ontology_id")
-         ln.save(records)
+         if len(diseases) == 0:
+             bt.Disease.import_from_source(update=True)
+         else:
+             names = bt.Disease.public().df().index if not diseases else diseases
+             records = bt.Disease.from_values(names, field="ontology_id")
+             ln.save(records)
          bt.Disease(name="normal", ontology_id="PATO:0000461").save()
          bt.Disease(name="unknown", ontology_id="unknown").save()
      # genes
-     for organism in organism_names:
+     for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
          # convert onto to name
          organism = bt.Organism.filter(ontology_id=organism).one().name
          names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
-         records = bt.Gene.from_values(
-             names,
-             field="ensembl_gene_id",
-             organism=organism,
-         )
-         ln.save(records)
+
+         # Process names in blocks of 10,000 elements
+         block_size = 10000
+         for i in range(0, len(names), block_size):
+             block = names[i : i + block_size]
+             records = bt.Gene.from_values(
+                 block,
+                 field="ensembl_gene_id",
+                 organism=organism,
+             )
+             ln.save(records)


  def is_outlier(adata: AnnData, metric: str, nmads: int):
scdataloader-1.2.2.dist-info/METADATA ADDED
@@ -0,0 +1,299 @@
+ Metadata-Version: 2.3
+ Name: scdataloader
+ Version: 1.2.2
+ Summary: a dataloader for single cell data in lamindb
+ Project-URL: repository, https://github.com/jkobject/scDataLoader
+ Author-email: jkobject <jkobject@gmail.com>
+ License: MIT
+ Keywords: dataloader,lamindb,pytorch,scPRINT,scRNAseq
+ Requires-Python: <3.11,>=3.10
+ Requires-Dist: anndata>=0.9.0
+ Requires-Dist: biomart>=0.9.0
+ Requires-Dist: cellxgene-census>=0.1.0
+ Requires-Dist: django>=4.0.0
+ Requires-Dist: ipykernel>=6.20.0
+ Requires-Dist: lamindb[bionty]==0.76.12
+ Requires-Dist: leidenalg>=0.8.0
+ Requires-Dist: lightning>=2.0.0
+ Requires-Dist: matplotlib>=3.5.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: palantir>=1.3.3
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: scikit-misc>=0.5.0
+ Requires-Dist: seaborn>=0.11.0
+ Requires-Dist: torch==2.2.0
+ Requires-Dist: torchdata>=0.5.0
+ Provides-Extra: dev
+ Requires-Dist: coverage>=7.3.2; extra == 'dev'
+ Requires-Dist: gitchangelog>=3.0.4; extra == 'dev'
+ Requires-Dist: mkdocs-git-authors-plugin>=0.4.0; extra == 'dev'
+ Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.0.0; extra == 'dev'
+ Requires-Dist: mkdocs-jupyter>=0.2.0; extra == 'dev'
+ Requires-Dist: mkdocs>=1.5.3; extra == 'dev'
+ Requires-Dist: mkdocstrings-python>=0.10.0; extra == 'dev'
+ Requires-Dist: mkdocstrings>=0.22.0; extra == 'dev'
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
+ Requires-Dist: pytest>=7.4.3; extra == 'dev'
+ Requires-Dist: ruff>=0.6.4; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # scdataloader
+
+ [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
+ [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
+ [![GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)](https://img.shields.io/github/issues/jkobject/scDataLoader)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)
+
+ This single cell pytorch dataloader / lightning datamodule is designed to be used with:
+
+ - [lamindb](https://lamin.ai/)
+
+ and:
+
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+
+ It allows you to:
+
+ 1. load thousands of datasets containing millions of cells in a few seconds.
+ 2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
+ 3. create a more complex single cell dataset
+ 4. extend it to your needs
+
+ built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
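+
+ For a sense of what `.mapped()` provides, here is a minimal sketch (assuming a local lamindb instance with a saved collection named "test"; the `obs_keys` value is illustrative):
+
+ ```python
+ import lamindb as ln
+
+ # a saved collection of AnnData artifacts
+ collection = ln.Collection.filter(name="test").one()
+
+ # expose all cells across all artifacts as one torch-style dataset
+ dataset = collection.mapped(obs_keys=["cell_type"])
+ print(len(dataset))  # total number of cells
+ print(dataset[0])    # one cell: expression values plus the requested obs
+ ```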
+
+ The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
+
+ ## More
+
+ I needed to create this dataloader for my PhD project. I use it to load and preprocess thousands of datasets containing millions of cells in a few seconds. I believe that people applying AI to single-cell RNA-seq and other sequencing datasets will want such a tool, which did not exist before.
+
+ ![scdataloader.drawio.png](docs/scdataloader.drawio.png)
+
+ ## Install it from PyPI
+
+ ```bash
+ pip install scdataloader
+ # or
+ pip install 'scDataLoader[dev]' # for dev dependencies
+
+ lamin init --storage ./testdb --name test --schema bionty
+ ```
+
+ If you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT uses ontologies to define its cell types, diseases, sexes, ethnicities, etc.
+
+ You can do it manually or with our function:
+
+ ```python
+ from scdataloader.utils import populate_my_ontology
+
+ populate_my_ontology()  # to populate everything (recommended) (can take 2-10 min)
+
+ populate_my_ontology(  # the minimum for the tool to work
+     organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
+     sex=["PATO:0000384", "PATO:0000383"],
+     celltypes=None,
+     ethnicities=None,
+     assays=None,
+     tissues=None,
+     diseases=None,
+     dev_stages=None,
+ )
+ ```
+
+ ### Dev install
+
+ If you want to use the latest version of scDataLoader and work on the code yourself, use `git clone` and `pip install -e` instead of `pip install`.
+
+ ```bash
+ git clone https://github.com/jkobject/scDataLoader.git
+ pip install -e 'scDataLoader[dev]'
+ ```
+
+ ## Usage
+
+ ### DataModule usage
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln
+
+ from scdataloader import utils, Preprocessor, DataModule
+
+ # preprocess datasets (adata is an AnnData object you have loaded)
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ art = ln.Artifact(adata, description="test")
+ art.save()
+ ln.Collection(art, name="test", description="test").save()
+
+ datamodule = DataModule(
+     collection_name="test",
+     organisms=["NCBITaxon:9606"],  # organism that we will work on
+     how="most expr",  # for the collator (most expr genes only will be selected)
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+ )
+ ```
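+
+ Outside of a lightning `Trainer`, the datamodule can also be iterated directly; a minimal sketch (assuming the "test" collection created above, and noting that `setup()` may take a stage argument depending on the version):
+
+ ```python
+ # iterate the training dataloader directly
+ datamodule.setup()
+ for batch in datamodule.train_dataloader():
+     # batch keys follow the collator output, e.g. "genes" and "x"
+     print(batch["genes"].shape, batch["x"].shape)
+     break
+ ```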
+
+ ### lightning-free usage (Dataset+Collator+DataLoader)
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ from tqdm import tqdm
+
+ from scdataloader import utils, Preprocessor, SimpleAnnDataset, Collator, DataLoader
+
+ # preprocess dataset
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ # create dataset
+ adataset = SimpleAnnDataset(
+     adata, obs_to_output=["organism_ontology_term_id"]
+ )
+ # create collator
+ col = Collator(
+     organisms="NCBITaxon:9606",
+     valid_genes=adata.var_names,
+     max_len=2000,  # maximum number of genes to use
+     how="most expr",  # or "some" / "random_expr"
+     # genelist=[geneA, geneB] if how=='some'
+ )
+ # create dataloader
+ dataloader = DataLoader(
+     adataset,
+     collate_fn=col,
+     batch_size=64,
+     num_workers=4,
+     shuffle=False,
+ )
+
+ # predict (assuming `model` is your trained model)
+ for batch in tqdm(dataloader):
+     gene_pos, expression, depth = (
+         batch["genes"],
+         batch["x"],
+         batch["depth"],
+     )
+     model.predict(
+         gene_pos,
+         expression,
+         depth,
+     )
+ ```
+
+ ### Usage on all of cellxgene
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln
+
+ from scdataloader import utils
+ from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+ # preprocess datasets
+ DESCRIPTION = 'preprocessed by scDataLoader'
+
+ cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+ print(cx_dataset, len(cx_dataset.artifacts.all()))
+
+ do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+ preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+     collection_name="preprocessed dataset",
+     organisms=["NCBITaxon:9606"],  # organism that we will work on
+     how="most expr",  # for the collator (most expr genes only will be selected)
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+     test_split=0,
+ )
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+     print(i)  # or do something with the batch
+     break
+
+ # with lightning:
+ # Trainer(model, datamodule)
+ ```
+
+ See the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+ 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+ 2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+ ### command line preprocessing
+
+ You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
+
+ ```bash
+ scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+ ```
+
+ ### command line usage
+
+ The main way to use it at scale is through the command line:
+
+ > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
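+
+ For instance, a minimal sketch of plugging the datamodule into a lightning `Trainer` (assuming `model` is a `LightningModule` that accepts the collator's batch format):
+
+ ```python
+ import lightning as L
+
+ # `model` and `datamodule` as defined in the sections above
+ trainer = L.Trainer(max_epochs=1)
+ trainer.fit(model, datamodule=datamodule)
+ ```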
+
+ ## FAQ
+
+ ### how to update my ontologies?
+
+ ```python
+ import bionty as bt
+ bt.reset_sources()
+
+ # Run via CLI: lamin load <your instance>
+
+ import lnschema_bionty as lb
+ lb.dev.sync_bionty_source_to_latest()
+ ```
+
+ ### how to load all ontologies?
+
+ ```python
+ from scdataloader import utils
+ utils.populate_my_ontology()  # this might take from 5-20 min
+ ```
+
+ ## Development
+
+ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - [lamin.ai](https://lamin.ai/)
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+ - [scprint](https://www.jkobject.com/scPRINT/)
+
+ Awesome single cell dataloader created by @jkobject
scdataloader-1.2.2.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ scdataloader/VERSION,sha256=xipcxhrEUlk1dT9ewoTAoFKksdpLOjWA3OK313ohVK4,6
+ scdataloader/__init__.py,sha256=5y9VzRhOAUWeYMn2MrRRRlzgdiMjRFytr7gcn-I6IkE,147
+ scdataloader/__main__.py,sha256=VXrt2IykBypnIXWydwA7NfF7LtRGc-0Khjtm5OIBNpI,6527
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+ scdataloader/collator.py,sha256=gzHiuixUwK8JClhAbG12kgWMU_VTKkowibA-tDFpbwo,11341
+ scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
+ scdataloader/data.py,sha256=3dCp-lIAfOkCi76SH5W3iSqFmAWZslwARkN9v5mylz8,14907
+ scdataloader/datamodule.py,sha256=B-udBevPSPF__hfy0pOz1dGovgE95K2pxPupjB7RblI,16936
+ scdataloader/preprocess.py,sha256=pH4EPrcRqH34o3t5X3A4kETiYdCZngih5SdP_PPfgOo,29178
+ scdataloader/utils.py,sha256=7tgt3sPj_XTKb-UlJDAZWvQr0_DG9VTC6ioiLdBWFFE,22498
+ scdataloader-1.2.2.dist-info/METADATA,sha256=XMtKO9ImiyY--F92njvMUe69OaJgDx8C3xQtBAXqo8g,9800
+ scdataloader-1.2.2.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ scdataloader-1.2.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ scdataloader-1.2.2.dist-info/RECORD,,
scdataloader-1.2.2.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: hatchling 1.26.3
  Root-Is-Purelib: true
  Tag: py3-none-any