scdataloader 1.1.3__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +1 -1
- scdataloader/__main__.py +16 -7
- scdataloader/collator.py +4 -2
- scdataloader/data.py +41 -17
- scdataloader/datamodule.py +13 -13
- scdataloader/preprocess.py +71 -56
- scdataloader/utils.py +87 -61
- scdataloader-1.2.2.dist-info/METADATA +299 -0
- scdataloader-1.2.2.dist-info/RECORD +14 -0
- {scdataloader-1.1.3.dist-info → scdataloader-1.2.2.dist-info}/WHEEL +1 -1
- scdataloader/mapped.py +0 -540
- scdataloader-1.1.3.dist-info/METADATA +0 -899
- scdataloader-1.1.3.dist-info/RECORD +0 -16
- scdataloader-1.1.3.dist-info/entry_points.txt +0 -3
- {scdataloader-1.1.3.dist-info → scdataloader-1.2.2.dist-info/licenses}/LICENSE +0 -0
scdataloader/utils.py
CHANGED
```diff
@@ -1,23 +1,21 @@
 import io
 import os
 import urllib
+from collections import Counter
+from functools import lru_cache
+from typing import List, Optional, Union
 
 import bionty as bt
 import lamindb as ln
 import numpy as np
 import pandas as pd
+import torch
+from anndata import AnnData
 from biomart import BiomartServer
 from django.db import IntegrityError
 from scipy.sparse import csr_matrix
 from scipy.stats import median_abs_deviation
-from functools import lru_cache
-from collections import Counter
 from torch import Tensor
-import torch
-
-from typing import Union, List, Optional
-
-from anndata import AnnData
 
 
 def downsample_profile(mat: Tensor, dropout: float):
@@ -92,7 +90,7 @@ def _fetchFromServer(
 
 
 def getBiomartTable(
-    ensemble_server: str = "http://
+    ensemble_server: str = "http://may2024.archive.ensembl.org/biomart",
     useCache: bool = False,
     cache_folder: str = "/tmp/biomart/",
     attributes: List[str] = [],
@@ -102,7 +100,7 @@ def getBiomartTable(
     """generate a genelist dataframe from ensembl's biomart
 
     Args:
-        ensemble_server (str, optional): the biomart server. Defaults to "http://
+        ensemble_server (str, optional): the biomart server. Defaults to "http://may2023.archive.ensembl.org/biomart".
         useCache (bool, optional): whether to use the cache or not. Defaults to False.
         cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
         attributes (List[str], optional): the attributes to fetch. Defaults to [].
@@ -129,21 +127,20 @@ def getBiomartTable(
 
     cache_folder = os.path.expanduser(cache_folder)
     createFoldersFor(cache_folder)
-    cachefile = os.path.join(cache_folder, ".biomart.
+    cachefile = os.path.join(cache_folder, ".biomart.parquet")
     if useCache & os.path.isfile(cachefile):
         print("fetching gene names from biomart cache")
-        res = pd.
+        res = pd.read_parquet(cachefile)
     else:
         print("downloading gene names from biomart")
 
         res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
-        res.
+        res.to_parquet(cachefile, index=False)
     res.columns = attr + attributes
     if type(res) is not type(pd.DataFrame()):
         raise ValueError("should be a dataframe")
     res = res[~(res["ensembl_gene_id"].isna())]
    if "hgnc_symbol" in res.columns:
-        res = res[res["hgnc_symbol"].isna()]
         res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
             res.hgnc_symbol.isna()
         ]["ensembl_gene_id"]
@@ -371,10 +368,16 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10
         genesdf["organism"] = organism
         organismdf.append(genesdf)
     organismdf = pd.concat(organismdf)
-
-
-
-
+    for col in [
+        "source_id",
+        "run_id",
+        "created_by_id",
+        "updated_at",
+        "stable_id",
+        "created_at",
+    ]:
+        if col in organismdf.columns:
+            organismdf.drop(columns=[col], inplace=True)
     return organismdf
 
 
@@ -387,6 +390,7 @@ def populate_my_ontology(
     tissues: List[str] = [],
     diseases: List[str] = [],
     dev_stages: List[str] = [],
+    organism_clade: str = "vertebrates",
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -397,7 +401,7 @@ def populate_my_ontology(
 
     add whatever value you need afterward like it is done here with:
 
-    `bt.$ontology(name="ddd",
+    `bt.$ontology(name="ddd", ontolbogy_id="ddddd").save()`
 
     `df["assay_ontology_term_id"].unique()`
 
@@ -414,89 +418,111 @@ def populate_my_ontology(
     """
     # cell type
     if celltypes is not None:
-
-
-
+        if len(celltypes) == 0:
+            bt.CellType.import_from_source(update=True)
+        else:
+            names = bt.CellType.public().df().index if not celltypes else celltypes
+            records = bt.CellType.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
     if organisms is not None:
-        names =
+        names = (
+            bt.Organism.public(organism=organism_clade).df().index
+            if not organisms
+            else organisms
+        )
+        source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last()
         records = [
             i[0] if type(i) is list else i
-            for i in [
+            for i in [
+                bt.Organism.from_source(ontology_id=i, source=source) for i in names
+            ]
         ]
         ln.save(records)
         bt.Organism(name="unknown", ontology_id="unknown").save()
-        organism_names = names
     # Phenotype
     if sex is not None:
         names = bt.Phenotype.public().df().index if not sex else sex
+        source = bt.PublicSource.filter(name="pato").first()
         records = [
-            bt.Phenotype.from_source(
-                ontology_id=i, source=bt.PublicSource.filter(name="pato").first()
-            )
-            for i in names
+            bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
         ]
         ln.save(records)
         bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
     if ethnicities is not None:
-
-
-
+        if len(ethnicities) == 0:
+            bt.Ethnicity.import_from_source(update=True)
+        else:
+            names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+            records = bt.Ethnicity.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.Ethnicity(
             name="unknown", ontology_id="unknown"
         ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
     if assays is not None:
-
-
-
+        if len(assays) == 0:
+            bt.ExperimentalFactor.import_from_source(update=True)
+        else:
+            names = bt.ExperimentalFactor.public().df().index if not assays else assays
+            records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
     # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
     if tissues is not None:
-
-
-
+        if len(tissues) == 0:
+            bt.Tissue.import_from_source(update=True)
+        else:
+            names = bt.Tissue.public().df().index if not tissues else tissues
+            records = bt.Tissue.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
     if dev_stages is not None:
-
-        bt.DevelopmentalStage.
-
-
-
+        if len(dev_stages) == 0:
+            bt.DevelopmentalStage.import_from_source(update=True)
+            source = bt.PublicSource.filter(organism="mouse", name="mmusdv").last()
+            bt.DevelopmentalStage.import_from_source(source=source)
+        else:
+            names = (
+                bt.DevelopmentalStage.public().df().index
+                if not dev_stages
+                else dev_stages
+            )
+            records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
 
-        names = bt.DevelopmentalStage.public(organism="mouse").df().index
-        records = [
-            bt.DevelopmentalStage.from_source(
-                ontology_id=i,
-                source=bt.PublicSource.filter(organism="mouse", name="mmusdv").first(),
-            )
-            for i in names.tolist()
-        ]
-        ln.save(records)
     # Disease
     if diseases is not None:
-
-
-
+        if len(diseases) == 0:
+            bt.Disease.import_from_source(update=True)
+        else:
+            names = bt.Disease.public().df().index if not diseases else diseases
+            records = bt.Disease.from_values(names, field="ontology_id")
+            ln.save(records)
         bt.Disease(name="normal", ontology_id="PATO:0000461").save()
         bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in
+    for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
         # convert onto to name
         organism = bt.Organism.filter(ontology_id=organism).one().name
         names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
-
-
-
-
-
-
+
+        # Process names in blocks of 10,000 elements
+        block_size = 10000
+        for i in range(0, len(names), block_size):
+            block = names[i : i + block_size]
+            records = bt.Gene.from_values(
+                block,
+                field="ensembl_gene_id",
+                organism=organism,
+            )
+            ln.save(records)
 
 
 def is_outlier(adata: AnnData, metric: str, nmads: int):
```
scdataloader-1.2.2.dist-info/METADATA
ADDED

Metadata-Version: 2.3
Name: scdataloader
Version: 1.2.2
Summary: a dataloader for single cell data in lamindb
Project-URL: repository, https://github.com/jkobject/scDataLoader
Author-email: jkobject <jkobject@gmail.com>
License: MIT
Keywords: dataloader,lamindb,pytorch,scPRINT,scRNAseq
Requires-Python: <3.11,>=3.10
Requires-Dist: anndata>=0.9.0
Requires-Dist: biomart>=0.9.0
Requires-Dist: cellxgene-census>=0.1.0
Requires-Dist: django>=4.0.0
Requires-Dist: ipykernel>=6.20.0
Requires-Dist: lamindb[bionty]==0.76.12
Requires-Dist: leidenalg>=0.8.0
Requires-Dist: lightning>=2.0.0
Requires-Dist: matplotlib>=3.5.0
Requires-Dist: numpy>=1.26.0
Requires-Dist: palantir>=1.3.3
Requires-Dist: pandas>=2.0.0
Requires-Dist: scikit-misc>=0.5.0
Requires-Dist: seaborn>=0.11.0
Requires-Dist: torch==2.2.0
Requires-Dist: torchdata>=0.5.0
Provides-Extra: dev
Requires-Dist: coverage>=7.3.2; extra == 'dev'
Requires-Dist: gitchangelog>=3.0.4; extra == 'dev'
Requires-Dist: mkdocs-git-authors-plugin>=0.4.0; extra == 'dev'
Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.0.0; extra == 'dev'
Requires-Dist: mkdocs-jupyter>=0.2.0; extra == 'dev'
Requires-Dist: mkdocs>=1.5.3; extra == 'dev'
Requires-Dist: mkdocstrings-python>=0.10.0; extra == 'dev'
Requires-Dist: mkdocstrings>=0.22.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
Requires-Dist: pytest>=7.4.3; extra == 'dev'
Requires-Dist: ruff>=0.6.4; extra == 'dev'
Description-Content-Type: text/markdown

# scdataloader

[](https://codecov.io/gh/jkobject/scDataLoader)
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
[](https://badge.fury.io/py/scDataLoader)
[](https://pepy.tech/project/scDataLoader)
[](https://pepy.tech/project/scDataLoader)
[](https://pepy.tech/project/scDataLoader)
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
[](https://github.com/psf/black)
[](https://doi.org/10.1101/2024.07.29.605556)

This single cell pytorch dataloader / lightning datamodule is designed to be used with:

- [lamindb](https://lamin.ai/)

and:

- [scanpy](https://scanpy.readthedocs.io/en/stable/)
- [anndata](https://anndata.readthedocs.io/en/latest/)

It allows you to:

1. load thousands of datasets containing millions of cells in a few seconds.
2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
3. create a more complex single cell dataset
4. extend it to your needs

built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd

The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).

## More

I needed to create this data loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believe that anyone applying AI to single-cell RNA sequencing and other sequencing datasets will find such a tool useful, as nothing comparable currently exists.

## Install it from PyPI

```bash
pip install scdataloader
# or
pip install "scDataLoader[dev]" # for dev dependencies

lamin init --storage ./testdb --name test --schema bionty
```

if you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT uses ontologies to define its cell types, diseases, sexes, ethnicities, etc.

you can do it manually or with our function:

```python
from scdataloader.utils import populate_my_ontology

populate_my_ontology()  # to populate everything (recommended) (can take 2-10 min)

populate_my_ontology(  # the minimum for the tool
    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    celltypes=None,
    ethnicities=None,
    assays=None,
    tissues=None,
    diseases=None,
    dev_stages=None,
)
```
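
To check that the registries were actually filled, you can count what now lives in your local instance. A minimal sketch, assuming a populated lamindb/bionty instance (the registry names mirror the arguments of `populate_my_ontology` above):

```python
import bionty as bt

# number of records now available in two of the populated registries
print(bt.CellType.filter().df().shape[0], "cell types")
print(bt.Disease.filter().df().shape[0], "diseases")
```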

### Dev install

If you want to use the latest version of scDataLoader and work on the code yourself, use `git clone` and `pip install -e` instead of `pip install`.

```bash
git clone https://github.com/jkobject/scDataLoader.git
pip install -e "scDataLoader[dev]"
```

## Usage

### DataModule usage

```python
# initialize a local lamin database
#! lamin init --storage ./cellxgene --name cellxgene --schema bionty
import lamindb as ln

from scdataloader import utils, Preprocessor, DataModule

# preprocess datasets (adata is the AnnData object you want to register)
preprocessor = Preprocessor(
    do_postp=False,
    force_preprocess=True,
)
adata = preprocessor(adata)

art = ln.Artifact(adata, description="test")
art.save()
ln.Collection(art, name="test", description="test").save()

datamodule = DataModule(
    collection_name="test",
    organisms=["NCBITaxon:9606"],  # organism that we will work on
    how="most expr",  # for the collator (most expr genes only will be selected)
    max_len=1000,  # only the 1000 most expressed
    batch_size=64,
    num_workers=1,
    validation_split=0.1,
)
```
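
A quick smoke test is to pull a single batch from the datamodule. This is a sketch, assuming the DataModule implements the standard Lightning `setup()` hook and yields dict batches as in the collator example below:

```python
datamodule.setup()
for batch in datamodule.train_dataloader():
    # each value is typically a tensor; print shapes to inspect the batch layout
    print({k: getattr(v, "shape", v) for k, v in batch.items()})
    break
```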

### lightning-free usage (Dataset+Collator+DataLoader)

```python
# initialize a local lamin database
#! lamin init --storage ./cellxgene --name cellxgene --schema bionty
from tqdm import tqdm

from scdataloader import utils, Preprocessor, SimpleAnnDataset, Collator, DataLoader

# preprocess dataset
preprocessor = Preprocessor(
    do_postp=False,
    force_preprocess=True,
)
adata = preprocessor(adata)

# create dataset
adataset = SimpleAnnDataset(
    adata, obs_to_output=["organism_ontology_term_id"]
)
# create collator
col = Collator(
    organisms="NCBITaxon:9606",
    valid_genes=adata.var_names,
    max_len=2000,  # maximum number of genes to use
    how="most expr",  # one of "some" | "most expr" | "random_expr"
    # genelist = [geneA, geneB] if how=='some'
)
# create dataloader
dataloader = DataLoader(
    adataset,
    collate_fn=col,
    batch_size=64,
    num_workers=4,
    shuffle=False,
)

# predict (model is your own model, e.g. scPRINT)
for batch in tqdm(dataloader):
    gene_pos, expression, depth = (
        batch["genes"],
        batch["x"],
        batch["depth"],
    )
    model.predict(
        gene_pos,
        expression,
        depth,
    )
```
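
If you want a fixed gene panel rather than the most expressed genes, the comment above suggests `how="some"` together with a `genelist`. A hypothetical sketch (the Ensembl ids are placeholders):

```python
# hypothetical panel; replace with the genes you actually need
panel = ["ENSG00000000003", "ENSG00000000419"]

col = Collator(
    organisms="NCBITaxon:9606",
    valid_genes=adata.var_names,
    how="some",  # output only the genes in genelist
    genelist=panel,
)
```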

### Usage on all of cellxgene

```python
# initialize a local lamin database
#! lamin init --storage ./cellxgene --name cellxgene --schema bionty
import lamindb as ln

from scdataloader import utils
from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess

# preprocess datasets
DESCRIPTION = 'preprocessed by scDataLoader'

cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
cx_dataset, len(cx_dataset.artifacts.all())

do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)

preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")

# create dataloaders
from scdataloader import DataModule
import tqdm

datamodule = DataModule(
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],  # organism that we will work on
    how="most expr",  # for the collator (most expr genes only will be selected)
    max_len=1000,  # only the 1000 most expressed
    batch_size=64,
    num_workers=1,
    validation_split=0.1,
    test_split=0,
)

for i in tqdm.tqdm(datamodule.train_dataloader()):
    # do something with the batch
    print(i)
    break

# with lightning:
# Trainer(model, datamodule)
```

see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):

1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)

### command line preprocessing

You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier scripting.

```bash
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
```

### command line usage

The main way to use scdataloader from the command line is through a lightning CLI such as scPRINT's.

> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage

## FAQ

### how to update my ontologies?

```python
import bionty as bt

bt.reset_sources()

# Run via CLI: lamin load <your instance>

import lnschema_bionty as lb

lb.dev.sync_bionty_source_to_latest()
```

### how to load all ontologies?

```python
from scdataloader import utils

utils.populate_my_ontology()  # this might take from 5-20 min
```

## Development

Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Acknowledgments

- [lamin.ai](https://lamin.ai/)
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
- [anndata](https://anndata.readthedocs.io/en/latest/)
- [scprint](https://www.jkobject.com/scPRINT/)

Awesome single cell dataloader created by @jkobject
scdataloader-1.2.2.dist-info/RECORD
ADDED

scdataloader/VERSION,sha256=xipcxhrEUlk1dT9ewoTAoFKksdpLOjWA3OK313ohVK4,6
scdataloader/__init__.py,sha256=5y9VzRhOAUWeYMn2MrRRRlzgdiMjRFytr7gcn-I6IkE,147
scdataloader/__main__.py,sha256=VXrt2IykBypnIXWydwA7NfF7LtRGc-0Khjtm5OIBNpI,6527
scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
scdataloader/collator.py,sha256=gzHiuixUwK8JClhAbG12kgWMU_VTKkowibA-tDFpbwo,11341
scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
scdataloader/data.py,sha256=3dCp-lIAfOkCi76SH5W3iSqFmAWZslwARkN9v5mylz8,14907
scdataloader/datamodule.py,sha256=B-udBevPSPF__hfy0pOz1dGovgE95K2pxPupjB7RblI,16936
scdataloader/preprocess.py,sha256=pH4EPrcRqH34o3t5X3A4kETiYdCZngih5SdP_PPfgOo,29178
scdataloader/utils.py,sha256=7tgt3sPj_XTKb-UlJDAZWvQr0_DG9VTC6ioiLdBWFFE,22498
scdataloader-1.2.2.dist-info/METADATA,sha256=XMtKO9ImiyY--F92njvMUe69OaJgDx8C3xQtBAXqo8g,9800
scdataloader-1.2.2.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
scdataloader-1.2.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
scdataloader-1.2.2.dist-info/RECORD,,
|