scdataloader 0.0.4__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scdataloader-0.0.4 → scdataloader-1.0.5}/PKG-INFO +82 -26
- scdataloader-1.0.5/README.md +154 -0
- scdataloader-1.0.5/pyproject.toml +62 -0
- scdataloader-1.0.5/scdataloader/VERSION +1 -0
- scdataloader-1.0.5/scdataloader/__init__.py +4 -0
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/__main__.py +3 -0
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/collator.py +61 -96
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/config.py +6 -0
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/data.py +138 -90
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/datamodule.py +67 -39
- scdataloader-1.0.5/scdataloader/mapped.py +540 -0
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/preprocess.py +4 -213
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/utils.py +128 -92
- scdataloader-0.0.4/README.md +0 -107
- scdataloader-0.0.4/pyproject.toml +0 -53
- scdataloader-0.0.4/scdataloader/VERSION +0 -1
- scdataloader-0.0.4/scdataloader/__init__.py +0 -4
- scdataloader-0.0.4/scdataloader/mapped.py +0 -358
- {scdataloader-0.0.4 → scdataloader-1.0.5}/LICENSE +0 -0
- {scdataloader-0.0.4 → scdataloader-1.0.5}/scdataloader/base.py +0 -0
|
@@ -1,28 +1,37 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.0.5
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Home-page: https://github.com/jkobject/scDataLoader
|
|
6
6
|
License: GPL3
|
|
7
|
-
Keywords: scRNAseq,dataloader,pytorch,lamindb,
|
|
7
|
+
Keywords: scRNAseq,dataloader,pytorch,lamindb,scPRINT
|
|
8
8
|
Author: jkobject
|
|
9
9
|
Requires-Python: ==3.10.*
|
|
10
10
|
Classifier: License :: Other/Proprietary License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Provides-Extra: dev
|
|
13
14
|
Requires-Dist: anndata
|
|
14
15
|
Requires-Dist: biomart
|
|
15
|
-
Requires-Dist: bionty
|
|
16
|
+
Requires-Dist: bionty (==0.48.0)
|
|
17
|
+
Requires-Dist: black (>=23.10.1,<24.0.0) ; extra == "dev"
|
|
16
18
|
Requires-Dist: cellxgene-census
|
|
19
|
+
Requires-Dist: coverage (>=7.3.2,<8.0.0) ; extra == "dev"
|
|
17
20
|
Requires-Dist: decoupler
|
|
18
21
|
Requires-Dist: django
|
|
22
|
+
Requires-Dist: flake8 (>=6.1.0,<7.0.0) ; extra == "dev"
|
|
23
|
+
Requires-Dist: gitchangelog (>=3.0.4,<4.0.0) ; extra == "dev"
|
|
19
24
|
Requires-Dist: ipykernel
|
|
20
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: isort (>=5.12.0,<6.0.0) ; extra == "dev"
|
|
26
|
+
Requires-Dist: lamindb (==0.75.1)
|
|
21
27
|
Requires-Dist: leidenalg
|
|
22
28
|
Requires-Dist: lightning
|
|
23
|
-
Requires-Dist: lnschema-bionty
|
|
24
29
|
Requires-Dist: matplotlib
|
|
30
|
+
Requires-Dist: mkdocs (>=1.5.3,<2.0.0) ; extra == "dev"
|
|
31
|
+
Requires-Dist: mypy (>=1.6.1,<2.0.0) ; extra == "dev"
|
|
25
32
|
Requires-Dist: pandas (>=2.0.0)
|
|
33
|
+
Requires-Dist: pytest (>=7.4.3,<8.0.0) ; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov (>=4.1.0,<5.0.0) ; extra == "dev"
|
|
26
35
|
Requires-Dist: scikit-misc
|
|
27
36
|
Requires-Dist: seaborn
|
|
28
37
|
Requires-Dist: torch
|
|
@@ -34,14 +43,16 @@ Description-Content-Type: text/markdown
|
|
|
34
43
|
|
|
35
44
|
[](https://codecov.io/gh/jkobject/scDataLoader)
|
|
36
45
|
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
|
|
37
|
-
[](https://badge.fury.io/py/scDataLoader)
|
|
47
|
+
[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
|
|
48
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
49
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
50
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
51
|
+
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
|
|
52
|
+
[](https://github.com/psf/black)
|
|
53
|
+
[](https://doi.org/10.1101/2024.07.29.605556)
|
|
38
54
|
|
|
39
|
-
|
|
40
|
-
Awesome single cell dataloader created by @jkobject
|
|
41
|
-
|
|
42
|
-
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
43
|
-
|
|
44
|
-
This data loader is designed to be used with:
|
|
55
|
+
This single cell pytorch dataloader / lightning datamodule is designed to be used with:
|
|
45
56
|
|
|
46
57
|
- [lamindb](https://lamin.ai/)
|
|
47
58
|
|
|
@@ -57,18 +68,15 @@ It allows you to:
|
|
|
57
68
|
3. create a more complex single cell dataset
|
|
58
69
|
4. extend it to your need
|
|
59
70
|
|
|
60
|
-
|
|
71
|
+
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
61
72
|
|
|
62
|
-
|
|
73
|
+
The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
|
|
63
74
|
|
|
64
|
-
|
|
65
|
-
2. doing some dataset specific preprocessing if needed
|
|
66
|
-
3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
|
|
67
|
-
4. passing it to a dataloader object that can work with it correctly
|
|
75
|
+
## More
|
|
68
76
|
|
|
69
|
-
|
|
77
|
+
I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
|
|
70
78
|
|
|
71
|
-

|
|
72
80
|
|
|
73
81
|
## Install it from PyPI
|
|
74
82
|
|
|
@@ -80,13 +88,13 @@ pip install scdataloader
|
|
|
80
88
|
|
|
81
89
|
```bash
|
|
82
90
|
git clone https://github.com/jkobject/scDataLoader.git
|
|
83
|
-
|
|
84
|
-
poetry install
|
|
91
|
+
pip install -e scDataLoader
|
|
85
92
|
```
|
|
86
|
-
then run the notebooks with the poetry installed environment
|
|
87
93
|
|
|
88
94
|
## Usage
|
|
89
95
|
|
|
96
|
+
### Direct Usage
|
|
97
|
+
|
|
90
98
|
```python
|
|
91
99
|
# initialize a local lamin database
|
|
92
100
|
# !lamin init --storage ~/scdataloader --schema bionty
|
|
@@ -129,15 +137,63 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
|
129
137
|
|
|
130
138
|
```
|
|
131
139
|
|
|
132
|
-
see the notebooks in [docs](https://jkobject.
|
|
140
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
|
|
141
|
+
|
|
142
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
143
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
133
144
|
|
|
134
|
-
|
|
135
|
-
|
|
145
|
+
### command line preprocessing
|
|
146
|
+
|
|
147
|
+
You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### command line usage
|
|
154
|
+
|
|
155
|
+
The main way to use
|
|
156
|
+
|
|
157
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
|
|
158
|
+
|
|
159
|
+
## FAQ
|
|
160
|
+
|
|
161
|
+
### how to update my ontologies?
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
import bionty as bt
|
|
165
|
+
bt.reset_sources()
|
|
166
|
+
|
|
167
|
+
# Run via CLI: lamin load <your instance>
|
|
168
|
+
|
|
169
|
+
import lnschema_bionty as lb
|
|
170
|
+
lb.dev.sync_bionty_source_to_latest()
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### how to load all ontologies?
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from scdataloader import utils
|
|
177
|
+
utils.populate_ontologies() # this might take from 5-20mins
|
|
178
|
+
```
|
|
136
179
|
|
|
137
180
|
## Development
|
|
138
181
|
|
|
139
182
|
Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
|
140
183
|
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
This project is licensed under the GNU General Public License v3.0 (GPL-3.0) - see the [LICENSE](LICENSE) file for details.
|
|
187
|
+
|
|
188
|
+
## Acknowledgments
|
|
189
|
+
|
|
190
|
+
- [lamin.ai](https://lamin.ai/)
|
|
191
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
192
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
193
|
+
- [scprint](https://www.jkobject.com/scPRINT/)
|
|
194
|
+
|
|
195
|
+
Awesome single cell dataloader created by @jkobject
|
|
196
|
+
|
|
141
197
|
GNU GENERAL PUBLIC LICENSE
|
|
142
198
|
Version 3, 29 June 2007
|
|
143
199
|
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# scdataloader
|
|
2
|
+
|
|
3
|
+
[](https://codecov.io/gh/jkobject/scDataLoader)
|
|
4
|
+
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
|
|
5
|
+
[](https://badge.fury.io/py/scDataLoader)
|
|
6
|
+
[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
|
|
7
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
8
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
9
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
10
|
+
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
|
|
11
|
+
[](https://github.com/psf/black)
|
|
12
|
+
[](https://doi.org/10.1101/2024.07.29.605556)
|
|
13
|
+
|
|
14
|
+
This single cell pytorch dataloader / lightning datamodule is designed to be used with:
|
|
15
|
+
|
|
16
|
+
- [lamindb](https://lamin.ai/)
|
|
17
|
+
|
|
18
|
+
and:
|
|
19
|
+
|
|
20
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
21
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
22
|
+
|
|
23
|
+
It allows you to:
|
|
24
|
+
|
|
25
|
+
1. load thousands of datasets containing millions of cells in a few seconds.
|
|
26
|
+
2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
|
|
27
|
+
3. create a more complex single cell dataset
|
|
28
|
+
4. extend it to your need
|
|
29
|
+
|
|
30
|
+
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
31
|
+
|
|
32
|
+
The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
|
|
33
|
+
|
|
34
|
+
## More
|
|
35
|
+
|
|
36
|
+
I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
|
|
37
|
+
|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
## Install it from PyPI
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install scdataloader
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Install it locally and run the notebooks:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/jkobject/scDataLoader.git
|
|
50
|
+
pip install -e scDataLoader
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Direct Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# initialize a local lamin database
|
|
59
|
+
# !lamin init --storage ~/scdataloader --schema bionty
|
|
60
|
+
|
|
61
|
+
from scdataloader import utils
|
|
62
|
+
from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
|
|
63
|
+
|
|
64
|
+
# preprocess datasets
|
|
65
|
+
DESCRIPTION='preprocessed by scDataLoader'
|
|
66
|
+
|
|
67
|
+
cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
|
|
68
|
+
cx_dataset, len(cx_dataset.artifacts.all())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
|
|
72
|
+
|
|
73
|
+
preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
|
|
74
|
+
|
|
75
|
+
# create dataloaders
|
|
76
|
+
from scdataloader import DataModule
|
|
77
|
+
import tqdm
|
|
78
|
+
|
|
79
|
+
datamodule = DataModule(
|
|
80
|
+
collection_name="preprocessed dataset",
|
|
81
|
+
organisms=["NCBITaxon:9606"], #organism that we will work on
|
|
82
|
+
how="most expr", # for the collator (most expr genes only will be selected)
|
|
83
|
+
max_len=1000, # only the 1000 most expressed
|
|
84
|
+
batch_size=64,
|
|
85
|
+
num_workers=1,
|
|
86
|
+
validation_split=0.1,
|
|
87
|
+
test_split=0)
|
|
88
|
+
|
|
89
|
+
for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
90
|
+
# pass #or do pass
|
|
91
|
+
print(i)
|
|
92
|
+
break
|
|
93
|
+
|
|
94
|
+
# with lightning:
|
|
95
|
+
# Trainer(model, datamodule)
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
|
|
100
|
+
|
|
101
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
102
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
103
|
+
|
|
104
|
+
### command line preprocessing
|
|
105
|
+
|
|
106
|
+
You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### command line usage
|
|
113
|
+
|
|
114
|
+
The main way to use
|
|
115
|
+
|
|
116
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
|
|
117
|
+
|
|
118
|
+
## FAQ
|
|
119
|
+
|
|
120
|
+
### how to update my ontologies?
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
import bionty as bt
|
|
124
|
+
bt.reset_sources()
|
|
125
|
+
|
|
126
|
+
# Run via CLI: lamin load <your instance>
|
|
127
|
+
|
|
128
|
+
import lnschema_bionty as lb
|
|
129
|
+
lb.dev.sync_bionty_source_to_latest()
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### how to load all ontologies?
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from scdataloader import utils
|
|
136
|
+
utils.populate_ontologies() # this might take from 5-20mins
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
This project is licensed under the GNU General Public License v3.0 (GPL-3.0) - see the [LICENSE](LICENSE) file for details.
|
|
146
|
+
|
|
147
|
+
## Acknowledgments
|
|
148
|
+
|
|
149
|
+
- [lamin.ai](https://lamin.ai/)
|
|
150
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
151
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
152
|
+
- [scprint](https://www.jkobject.com/scPRINT/)
|
|
153
|
+
|
|
154
|
+
Awesome single cell dataloader created by @jkobject
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "scdataloader"
|
|
3
|
+
version = "1.0.5"
|
|
4
|
+
description = "a dataloader for single cell data in lamindb"
|
|
5
|
+
authors = ["jkobject"]
|
|
6
|
+
license = "GPL3"
|
|
7
|
+
readme = ["README.md", "LICENSE"]
|
|
8
|
+
repository = "https://github.com/jkobject/scDataLoader"
|
|
9
|
+
keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
|
|
10
|
+
|
|
11
|
+
[tool.poetry.dependencies]
|
|
12
|
+
python = "3.10.*"
|
|
13
|
+
lamindb = "0.75.1"
|
|
14
|
+
bionty = "0.48.0"
|
|
15
|
+
cellxgene-census = "*"
|
|
16
|
+
torch = "*"
|
|
17
|
+
lightning = "*"
|
|
18
|
+
anndata = "*"
|
|
19
|
+
matplotlib = "*"
|
|
20
|
+
seaborn = "*"
|
|
21
|
+
ipykernel = "*"
|
|
22
|
+
torchdata = "*"
|
|
23
|
+
biomart = "*"
|
|
24
|
+
pandas = ">=2.0.0"
|
|
25
|
+
leidenalg = "*"
|
|
26
|
+
decoupler = "*"
|
|
27
|
+
django = "*"
|
|
28
|
+
scikit-misc = "*"
|
|
29
|
+
pytest = { version = "^7.4.3", optional = true }
|
|
30
|
+
coverage = { version = "^7.3.2", optional = true }
|
|
31
|
+
flake8 = { version = "^6.1.0", optional = true }
|
|
32
|
+
black = { version = "^23.10.1", optional = true }
|
|
33
|
+
isort = { version = "^5.12.0", optional = true }
|
|
34
|
+
pytest-cov = { version = "^4.1.0", optional = true }
|
|
35
|
+
mypy = { version = "^1.6.1", optional = true }
|
|
36
|
+
gitchangelog = { version = "^3.0.4", optional = true }
|
|
37
|
+
mkdocs = { version = "^1.5.3", optional = true }
|
|
38
|
+
|
|
39
|
+
[tool.poetry.extras]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest",
|
|
42
|
+
"coverage",
|
|
43
|
+
"flake8",
|
|
44
|
+
"black",
|
|
45
|
+
"isort",
|
|
46
|
+
"pytest-cov",
|
|
47
|
+
"mypy",
|
|
48
|
+
"gitchangelog",
|
|
49
|
+
"mkdocs",
|
|
50
|
+
"mkdocs-git-revision-date-localized-plugin",
|
|
51
|
+
"mkdocstrings",
|
|
52
|
+
"mkdocs-git-authors-plugin",
|
|
53
|
+
"mkdocs-jupyter",
|
|
54
|
+
"mkdocstrings-python"
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
requires = ["poetry-core"]
|
|
59
|
+
build-backend = "poetry.core.masonry.api"
|
|
60
|
+
|
|
61
|
+
[tool.poetry.scripts]
|
|
62
|
+
scdataloader = 'scdataloader.__main__:main'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1.0.5
|
|
@@ -10,6 +10,9 @@ from typing import Optional, Union
|
|
|
10
10
|
|
|
11
11
|
# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
|
|
12
12
|
def main():
|
|
13
|
+
"""
|
|
14
|
+
main function to preprocess datasets in a given lamindb collection.
|
|
15
|
+
"""
|
|
13
16
|
parser = argparse.ArgumentParser(
|
|
14
17
|
description="Preprocess datasets in a given lamindb collection."
|
|
15
18
|
)
|
|
@@ -1,26 +1,27 @@
|
|
|
1
1
|
import numpy as np
|
|
2
|
-
from .utils import load_genes
|
|
2
|
+
from .utils import load_genes, downsample_profile
|
|
3
3
|
from torch import Tensor, long
|
|
4
|
-
|
|
5
|
-
# class SimpleCollator:
|
|
4
|
+
from typing import Optional
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class Collator:
|
|
9
8
|
def __init__(
|
|
10
9
|
self,
|
|
11
|
-
organisms: list,
|
|
12
|
-
how="all",
|
|
13
|
-
org_to_id: dict = None,
|
|
14
|
-
valid_genes: list = [],
|
|
15
|
-
max_len=2000,
|
|
16
|
-
add_zero_genes=0,
|
|
17
|
-
logp1=False,
|
|
18
|
-
norm_to=None,
|
|
19
|
-
n_bins=0,
|
|
20
|
-
tp_name=None,
|
|
21
|
-
organism_name="organism_ontology_term_id",
|
|
22
|
-
class_names=[],
|
|
23
|
-
genelist=[],
|
|
10
|
+
organisms: list[str],
|
|
11
|
+
how: str = "all",
|
|
12
|
+
org_to_id: dict[str, int] = None,
|
|
13
|
+
valid_genes: list[str] = [],
|
|
14
|
+
max_len: int = 2000,
|
|
15
|
+
add_zero_genes: int = 0,
|
|
16
|
+
logp1: bool = False,
|
|
17
|
+
norm_to: Optional[float] = None,
|
|
18
|
+
n_bins: int = 0,
|
|
19
|
+
tp_name: Optional[str] = None,
|
|
20
|
+
organism_name: str = "organism_ontology_term_id",
|
|
21
|
+
class_names: list[str] = [],
|
|
22
|
+
genelist: list[str] = [],
|
|
23
|
+
downsample: Optional[float] = None, # don't use it for training!
|
|
24
|
+
save_output: bool = False,
|
|
24
25
|
):
|
|
25
26
|
"""
|
|
26
27
|
This class is responsible for collating data for the scPRINT model. It handles the
|
|
@@ -44,38 +45,57 @@ class Collator:
|
|
|
44
45
|
org_to_id (dict): Dictionary mapping organisms to their respective IDs.
|
|
45
46
|
valid_genes (list, optional): List of genes from the datasets, to be considered. Defaults to [].
|
|
46
47
|
it will drop any other genes from the input expression data (useful when your model only works on some genes)
|
|
47
|
-
max_len (int, optional):
|
|
48
|
+
max_len (int, optional): Total number of genes to use (for random expr and most expr). Defaults to 2000.
|
|
48
49
|
n_bins (int, optional): Number of bins for binning the data. Defaults to 0. meaning, no binning of expression.
|
|
49
50
|
add_zero_genes (int, optional): Number of additional unexpressed genes to add to the input data. Defaults to 0.
|
|
50
51
|
logp1 (bool, optional): If True, logp1 normalization is applied. Defaults to False.
|
|
51
|
-
norm_to (
|
|
52
|
+
norm_to (float, optional): Rescaling value of the normalization to be applied. Defaults to None.
|
|
53
|
+
organism_name (str, optional): Name of the organism ontology term id. Defaults to "organism_ontology_term_id".
|
|
54
|
+
tp_name (str, optional): Name of the heat diff. Defaults to None.
|
|
55
|
+
class_names (list, optional): List of other classes to be considered. Defaults to [].
|
|
56
|
+
genelist (list, optional): List of genes to be considered. Defaults to [].
|
|
57
|
+
If [] all genes will be considered
|
|
58
|
+
downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
|
|
59
|
+
This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
|
|
60
|
+
save_output (bool, optional): If True, saves the output to a file. Defaults to False.
|
|
61
|
+
This is mainly for debugging purposes
|
|
52
62
|
"""
|
|
53
63
|
self.organisms = organisms
|
|
64
|
+
self.genedf = load_genes(organisms)
|
|
54
65
|
self.max_len = max_len
|
|
55
66
|
self.n_bins = n_bins
|
|
56
67
|
self.add_zero_genes = add_zero_genes
|
|
57
68
|
self.logp1 = logp1
|
|
58
69
|
self.norm_to = norm_to
|
|
59
|
-
self.org_to_id = org_to_id
|
|
60
70
|
self.how = how
|
|
61
|
-
self.organism_ids = (
|
|
62
|
-
set([org_to_id[k] for k in organisms])
|
|
63
|
-
if org_to_id is not None
|
|
64
|
-
else set(organisms)
|
|
65
|
-
)
|
|
66
71
|
if self.how == "some":
|
|
67
72
|
assert len(genelist) > 0, "if how is some, genelist must be provided"
|
|
68
73
|
self.organism_name = organism_name
|
|
69
74
|
self.tp_name = tp_name
|
|
70
75
|
self.class_names = class_names
|
|
71
|
-
|
|
76
|
+
self.save_output = save_output
|
|
72
77
|
self.start_idx = {}
|
|
73
78
|
self.accepted_genes = {}
|
|
74
|
-
self.
|
|
79
|
+
self.downsample = downsample
|
|
75
80
|
self.to_subset = {}
|
|
76
|
-
|
|
81
|
+
self._setup(org_to_id, valid_genes, genelist)
|
|
82
|
+
|
|
83
|
+
def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
|
|
84
|
+
self.org_to_id = org_to_id
|
|
85
|
+
self.to_subset = {}
|
|
86
|
+
self.accepted_genes = {}
|
|
87
|
+
self.start_idx = {}
|
|
88
|
+
self.organism_ids = (
|
|
89
|
+
set([org_to_id[k] for k in self.organisms])
|
|
90
|
+
if org_to_id is not None
|
|
91
|
+
else set(self.organisms)
|
|
92
|
+
)
|
|
93
|
+
for organism in self.organisms:
|
|
77
94
|
ogenedf = self.genedf[self.genedf.organism == organism]
|
|
78
|
-
|
|
95
|
+
if len(valid_genes) > 0:
|
|
96
|
+
tot = self.genedf[self.genedf.index.isin(valid_genes)]
|
|
97
|
+
else:
|
|
98
|
+
tot = self.genedf
|
|
79
99
|
org = org_to_id[organism] if org_to_id is not None else organism
|
|
80
100
|
self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
|
|
81
101
|
if len(valid_genes) > 0:
|
|
@@ -84,14 +104,14 @@ class Collator:
|
|
|
84
104
|
df = ogenedf[ogenedf.index.isin(valid_genes)]
|
|
85
105
|
self.to_subset.update({org: df.index.isin(genelist)})
|
|
86
106
|
|
|
87
|
-
def __call__(self, batch):
|
|
107
|
+
def __call__(self, batch) -> dict[str, Tensor]:
|
|
88
108
|
"""
|
|
89
109
|
__call__ applies the collator to a minibatch of data
|
|
90
110
|
|
|
91
111
|
Args:
|
|
92
112
|
batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
|
|
93
113
|
the first list is for the different samples, the second list is for the different elements with
|
|
94
|
-
elem["
|
|
114
|
+
elem["X"]: gene expression
|
|
95
115
|
elem["organism_name"]: organism ontology term id
|
|
96
116
|
elem["tp_name"]: heat diff
|
|
97
117
|
elem["class_names.."]: other classes
|
|
@@ -113,9 +133,9 @@ class Collator:
|
|
|
113
133
|
organism_id = elem[self.organism_name]
|
|
114
134
|
if organism_id not in self.organism_ids:
|
|
115
135
|
continue
|
|
116
|
-
if "
|
|
117
|
-
dataset.append(elem["
|
|
118
|
-
expr = np.array(elem["
|
|
136
|
+
if "_storage_idx" in elem:
|
|
137
|
+
dataset.append(elem["_storage_idx"])
|
|
138
|
+
expr = np.array(elem["X"])
|
|
119
139
|
total_count.append(expr.sum())
|
|
120
140
|
if len(self.accepted_genes) > 0:
|
|
121
141
|
expr = expr[self.accepted_genes[organism_id]]
|
|
@@ -206,72 +226,17 @@ class Collator:
|
|
|
206
226
|
}
|
|
207
227
|
if len(dataset) > 0:
|
|
208
228
|
ret.update({"dataset": Tensor(dataset).to(long)})
|
|
229
|
+
if self.downsample is not None:
|
|
230
|
+
ret["x"] = downsample_profile(ret["x"], self.downsample)
|
|
231
|
+
if self.save_output:
|
|
232
|
+
with open("collator_output.txt", "a") as f:
|
|
233
|
+
np.savetxt(f, ret["x"].numpy())
|
|
209
234
|
return ret
|
|
210
235
|
|
|
211
236
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
AnnDataCollator Collator to use if working with AnnData's experimental dataloader (it is very slow!!!)
|
|
216
|
-
|
|
217
|
-
Args:
|
|
218
|
-
@see Collator
|
|
219
|
-
"""
|
|
220
|
-
super().__init__(*args, **kwargs)
|
|
221
|
-
|
|
222
|
-
def __call__(self, batch):
|
|
223
|
-
exprs = []
|
|
224
|
-
total_count = []
|
|
225
|
-
other_classes = []
|
|
226
|
-
gene_locs = []
|
|
227
|
-
tp = []
|
|
228
|
-
for elem in batch:
|
|
229
|
-
organism_id = elem.obs[self.organism_name]
|
|
230
|
-
if organism_id.item() not in self.organism_ids:
|
|
231
|
-
print(organism_id)
|
|
232
|
-
expr = np.array(elem.X[0])
|
|
233
|
-
|
|
234
|
-
total_count.append(expr.sum())
|
|
235
|
-
if len(self.accepted_genes) > 0:
|
|
236
|
-
expr = expr[self.accepted_genes[organism_id]]
|
|
237
|
-
if self.how == "most expr":
|
|
238
|
-
loc = np.argsort(expr)[-(self.max_len) :][::-1]
|
|
239
|
-
elif self.how == "random expr":
|
|
240
|
-
nnz_loc = np.where(expr > 0)[0]
|
|
241
|
-
loc = nnz_loc[
|
|
242
|
-
np.random.choice(len(nnz_loc), self.max_len, replace=False)
|
|
243
|
-
]
|
|
244
|
-
else:
|
|
245
|
-
raise ValueError("how must be either most expr or random expr")
|
|
246
|
-
if self.add_zero_genes > 0:
|
|
247
|
-
zero_loc = np.where(expr == 0)[0]
|
|
248
|
-
zero_loc = [
|
|
249
|
-
np.random.choice(len(zero_loc), self.add_zero_genes, replace=False)
|
|
250
|
-
]
|
|
251
|
-
loc = np.concatenate((loc, zero_loc), axis=None)
|
|
252
|
-
exprs.append(expr[loc])
|
|
253
|
-
gene_locs.append(loc + self.start_idx[organism_id.item()])
|
|
254
|
-
|
|
255
|
-
if self.tp_name is not None:
|
|
256
|
-
tp.append(elem.obs[self.tp_name])
|
|
257
|
-
else:
|
|
258
|
-
tp.append(0)
|
|
259
|
-
|
|
260
|
-
other_classes.append([elem.obs[i].values[0] for i in self.class_names])
|
|
261
|
-
|
|
262
|
-
expr = np.array(exprs)
|
|
263
|
-
tp = np.array(tp)
|
|
264
|
-
gene_locs = np.array(gene_locs)
|
|
265
|
-
total_count = np.array(total_count)
|
|
266
|
-
other_classes = np.array(other_classes)
|
|
267
|
-
return {
|
|
268
|
-
"x": Tensor(expr),
|
|
269
|
-
"genes": Tensor(gene_locs).int(),
|
|
270
|
-
"depth": Tensor(total_count),
|
|
271
|
-
"class": Tensor(other_classes),
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
|
|
237
|
+
#############
|
|
238
|
+
#### WIP ####
|
|
239
|
+
#############
|
|
275
240
|
class GeneformerCollator(Collator):
|
|
276
241
|
def __init__(self, *args, gene_norm_list: list, **kwargs):
|
|
277
242
|
"""
|