scdataloader 0.0.4__tar.gz → 1.0.1__tar.gz
- {scdataloader-0.0.4 → scdataloader-1.0.1}/PKG-INFO +45 -20
- scdataloader-1.0.1/README.md +133 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/pyproject.toml +1 -1
- scdataloader-1.0.1/scdataloader/VERSION +1 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/__main__.py +3 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/collator.py +56 -31
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/config.py +6 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/data.py +98 -87
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/datamodule.py +66 -38
- scdataloader-1.0.1/scdataloader/mapped.py +519 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/preprocess.py +3 -207
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/utils.py +57 -8
- scdataloader-0.0.4/README.md +0 -107
- scdataloader-0.0.4/scdataloader/VERSION +0 -1
- scdataloader-0.0.4/scdataloader/mapped.py +0 -358
- {scdataloader-0.0.4 → scdataloader-1.0.1}/LICENSE +0 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/__init__.py +0 -0
- {scdataloader-0.0.4 → scdataloader-1.0.1}/scdataloader/base.py +0 -0
--- scdataloader-0.0.4/PKG-INFO
+++ scdataloader-1.0.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.4
+Version: 1.0.1
 Summary: a dataloader for single cell data in lamindb
 Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
@@ -34,14 +34,16 @@ Description-Content-Type: text/markdown
 
 [codecov](https://codecov.io/gh/jkobject/scDataLoader)
 [CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
-[PyPI version](https://badge.fury.io/py/scDataLoader)
+[PyPI version](https://badge.fury.io/py/scDataLoader)
+[Documentation Status](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)
+[Code style: black](https://github.com/psf/black)
+[DOI](https://doi.org/10.1101/2024.07.29.605556)
 
-
-Awesome single cell dataloader created by @jkobject
-
-built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
-
-This data loader is designed to be used with:
+This single cell PyTorch dataloader / Lightning datamodule is designed to be used with:
 
 - [lamindb](https://lamin.ai/)
 
@@ -57,18 +59,13 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
 
-
-
-the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
 
-
-2. doing some dataset specific preprocessing if needed
-3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
-4. passing it to a dataloader object that can work with it correctly
+## More
 
-
+I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believe that anyone applying AI to single-cell RNA sequencing and other sequencing datasets will want such a tool, which did not exist until now.
 
-![scdataloader.drawio](docs/scdataloader.drawio.png)
+![scdataloader.drawio](docs/scdataloader.drawio.svg)
 
 ## Install it from PyPI
 
@@ -87,6 +84,8 @@ then run the notebooks with the poetry installed environment
 
 ## Usage
 
+### Direct Usage
+
 ```python
 # initialize a local lamin database
 # !lamin init --storage ~/scdataloader --schema bionty
@@ -129,15 +128,41 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
 
 ```
 
-see the notebooks in [docs](https://jkobject.
+See the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+### Command line preprocessing
 
-
-
+You can use the command line to preprocess a large database of datasets, like here for cellxgene. This allows parallelizing and easier usage.
+
+```bash
+scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+```
+
+### Command line usage
+
+The main way to use
+
+> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
 
 ## Development
 
 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
 
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+- [lamin.ai](https://lamin.ai/)
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+- [scprint](https://www.jkobject.com/scPRINT/)
+
+Awesome single cell dataloader created by @jkobject
 GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007
 
--- /dev/null
+++ scdataloader-1.0.1/README.md
@@ -0,0 +1,133 @@
+# scdataloader
+
+[codecov](https://codecov.io/gh/jkobject/scDataLoader)
+[CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[PyPI version](https://badge.fury.io/py/scDataLoader)
+[Documentation Status](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[Downloads](https://pepy.tech/project/scDataLoader)
+[GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)
+[Code style: black](https://github.com/psf/black)
+[DOI](https://doi.org/10.1101/2024.07.29.605556)
+
+This single cell PyTorch dataloader / Lightning datamodule is designed to be used with:
+
+- [lamindb](https://lamin.ai/)
+
+and:
+
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+
+It allows you to:
+
+1. load thousands of datasets containing millions of cells in a few seconds.
+2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
+3. create a more complex single cell dataset
+4. extend it to your need
+
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
+
+## More
+
+I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believe that anyone applying AI to single-cell RNA sequencing and other sequencing datasets will want such a tool, which did not exist until now.
+
+![scdataloader.drawio](docs/scdataloader.drawio.svg)
+
+## Install it from PyPI
+
+```bash
+pip install scdataloader
+```
+
+### Install it locally and run the notebooks:
+
+```bash
+git clone https://github.com/jkobject/scDataLoader.git
+cd scDataLoader
+poetry install
+```
+then run the notebooks with the poetry installed environment
+
+## Usage
+
+### Direct Usage
+
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
+
+
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"],  # organism that we will work on
+    how="most expr",  # for the collator (most expr genes only will be selected)
+    max_len=1000,  # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+
+# with lightning:
+# Trainer(model, datamodule)
+
+```
+
+See the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+### Command line preprocessing
+
+You can use the command line to preprocess a large database of datasets, like here for cellxgene. This allows parallelizing and easier usage.
+
+```bash
+scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+```
+
+### Command line usage
+
+The main way to use
+
+> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
+
+## Development
+
+Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+- [lamin.ai](https://lamin.ai/)
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+- [scprint](https://www.jkobject.com/scPRINT/)
+
+Awesome single cell dataloader created by @jkobject
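One thing to note about the README's Python example above: it calls `ln.Collection` without ever importing `ln`. A minimal preamble that would make the snippet self-contained, assuming the conventional lamindb alias:

```python
# Assumed preamble for the README example above: lamindb is
# conventionally imported under the alias `ln`, which the snippet uses.
import lamindb as ln
```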
--- /dev/null
+++ scdataloader-1.0.1/scdataloader/VERSION
@@ -0,0 +1 @@
+1.0.0
--- scdataloader-0.0.4/scdataloader/__main__.py
+++ scdataloader-1.0.1/scdataloader/__main__.py
@@ -10,6 +10,9 @@ from typing import Optional, Union
 
 # scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
 def main():
+    """
+    main function to preprocess datasets in a given lamindb collection.
+    """
     parser = argparse.ArgumentParser(
         description="Preprocess datasets in a given lamindb collection."
     )
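For readers who want to see how the documented flags fit together, here is a hedged argparse sketch mirroring the invocation shown in the comment above; the real `main()` in `__main__.py` is the authoritative definition and may accept more options.

```python
# Illustrative sketch only: argparse wiring for the flags shown in the
# documented invocation (--instance, --name, --version, --description,
# --new_name, --start_at). The real scdataloader __main__.py may differ.
import argparse

def sketch_main():
    parser = argparse.ArgumentParser(
        description="Preprocess datasets in a given lamindb collection."
    )
    parser.add_argument("--instance", help='lamindb instance, e.g. "laminlabs/cellxgene"')
    parser.add_argument("--name", help='collection name, e.g. "cellxgene-census"')
    parser.add_argument("--version", help='collection version, e.g. "2023-12-15"')
    parser.add_argument("--description", help="description for the preprocessed collection")
    parser.add_argument("--new_name", help="name of the new, preprocessed collection")
    parser.add_argument("--start_at", type=int, default=0, help="artifact index to resume from")
    args = parser.parse_args()
    return args  # the real main() hands these to the preprocessor

if __name__ == "__main__":
    print(sketch_main())
```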
--- scdataloader-0.0.4/scdataloader/collator.py
+++ scdataloader-1.0.1/scdataloader/collator.py
@@ -1,26 +1,27 @@
 import numpy as np
-from .utils import load_genes
+from .utils import load_genes, downsample_profile
 from torch import Tensor, long
-
-# class SimpleCollator:
+from typing import Optional
 
 
 class Collator:
     def __init__(
         self,
-        organisms: list,
-        how="all",
-        org_to_id: dict = None,
-        valid_genes: list = [],
-        max_len=2000,
-        add_zero_genes=0,
-        logp1=False,
-        norm_to=None,
-        n_bins=0,
-        tp_name=None,
-        organism_name="organism_ontology_term_id",
-        class_names=[],
-        genelist=[],
+        organisms: list[str],
+        how: str = "all",
+        org_to_id: dict[str, int] = None,
+        valid_genes: list[str] = [],
+        max_len: int = 2000,
+        add_zero_genes: int = 0,
+        logp1: bool = False,
+        norm_to: Optional[float] = None,
+        n_bins: int = 0,
+        tp_name: Optional[str] = None,
+        organism_name: str = "organism_ontology_term_id",
+        class_names: list[str] = [],
+        genelist: list[str] = [],
+        downsample: Optional[float] = None,  # don't use it for training!
+        save_output: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -44,36 +45,52 @@ class Collator:
             org_to_id (dict): Dictionary mapping organisms to their respective IDs.
             valid_genes (list, optional): List of genes from the datasets, to be considered. Defaults to [].
                 it will drop any other genes from the input expression data (useful when your model only works on some genes)
-            max_len (int, optional):
+            max_len (int, optional): Total number of genes to use (for random expr and most expr). Defaults to 2000.
             n_bins (int, optional): Number of bins for binning the data. Defaults to 0, meaning no binning of expression.
             add_zero_genes (int, optional): Number of additional unexpressed genes to add to the input data. Defaults to 0.
             logp1 (bool, optional): If True, logp1 normalization is applied. Defaults to False.
-            norm_to (
+            norm_to (float, optional): Rescaling value of the normalization to be applied. Defaults to None.
+            organism_name (str, optional): Name of the organism ontology term id. Defaults to "organism_ontology_term_id".
+            tp_name (str, optional): Name of the heat diff. Defaults to None.
+            class_names (list, optional): List of other classes to be considered. Defaults to [].
+            genelist (list, optional): List of genes to be considered. Defaults to [].
+                If [] all genes will be considered
+            downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
+                This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
+            save_output (bool, optional): If True, saves the output to a file. Defaults to False.
+                This is mainly for debugging purposes
         """
         self.organisms = organisms
+        self.genedf = load_genes(organisms)
         self.max_len = max_len
         self.n_bins = n_bins
         self.add_zero_genes = add_zero_genes
         self.logp1 = logp1
         self.norm_to = norm_to
-        self.org_to_id = org_to_id
         self.how = how
-        self.organism_ids = (
-            set([org_to_id[k] for k in organisms])
-            if org_to_id is not None
-            else set(organisms)
-        )
         if self.how == "some":
             assert len(genelist) > 0, "if how is some, genelist must be provided"
         self.organism_name = organism_name
         self.tp_name = tp_name
         self.class_names = class_names
-
+        self.save_output = save_output
         self.start_idx = {}
         self.accepted_genes = {}
-        self.
+        self.downsample = downsample
+        self.to_subset = {}
+        self._setup(org_to_id, valid_genes, genelist)
+
+    def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
+        self.org_to_id = org_to_id
         self.to_subset = {}
-
+        self.accepted_genes = {}
+        self.start_idx = {}
+        self.organism_ids = (
+            set([org_to_id[k] for k in self.organisms])
+            if org_to_id is not None
+            else set(self.organisms)
+        )
+        for organism in self.organisms:
             ogenedf = self.genedf[self.genedf.organism == organism]
             tot = self.genedf[self.genedf.index.isin(valid_genes)]
             org = org_to_id[organism] if org_to_id is not None else organism
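The new `downsample` option relies on `downsample_profile`, imported from `scdataloader.utils` in the first hunk; its implementation is not part of this diff. A plausible sketch, assuming it binomially thins the count matrix with `downsample` interpreted as the fraction of counts to keep:

```python
# Hedged sketch of a count-downsampling helper in the spirit of
# scdataloader.utils.downsample_profile; the real signature and behavior
# may differ. `fraction` plays the role of the collator's `downsample`.
import torch

def downsample_profile_sketch(x: torch.Tensor, fraction: float) -> torch.Tensor:
    """Binomially thin a (cells, genes) count matrix.

    Each count is resampled as Binomial(n=count, p=fraction), so each
    cell keeps roughly `fraction` of its total counts on average.
    """
    counts = x.float()
    return torch.binomial(counts, torch.full_like(counts, fraction))
```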
@@ -84,7 +101,7 @@ class Collator:
             df = ogenedf[ogenedf.index.isin(valid_genes)]
             self.to_subset.update({org: df.index.isin(genelist)})
 
-    def __call__(self, batch):
+    def __call__(self, batch) -> dict[str, Tensor]:
         """
         __call__ applies the collator to a minibatch of data
 
@@ -113,8 +130,8 @@
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
                 continue
-            if "
-                dataset.append(elem["
+            if "_storage_idx" in elem:
+                dataset.append(elem["_storage_idx"])
             expr = np.array(elem["x"])
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
@@ -206,6 +223,11 @@ class Collator:
         }
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
+        if self.downsample is not None:
+            ret["x"] = downsample_profile(ret["x"], self.downsample)
+        if self.save_output:
+            with open("collator_output.txt", "a") as f:
+                np.savetxt(f, ret["x"].numpy())
         return ret
 
 
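A usage sketch tying the new options together (assuming a configured lamindb instance, since `__init__` calls `load_genes`, and batch elements carrying the keys read in `__call__` above):

```python
# Hedged usage sketch for the new downsample / save_output options.
import numpy as np
from scdataloader.collator import Collator

collator = Collator(
    organisms=["NCBITaxon:9606"],
    how="most expr",
    max_len=1000,
    downsample=0.5,    # per the inline comment: don't use it for training!
    save_output=True,  # appends the collated expression to collator_output.txt
)
batch = [
    {
        "x": np.random.poisson(1.0, size=20000),  # toy expression vector
        "organism_ontology_term_id": "NCBITaxon:9606",
        "_storage_idx": 0,  # collected into ret["dataset"]
    }
]
ret = collator(batch)  # dict of Tensors, e.g. ret["x"] for expression
```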
@@ -219,7 +241,7 @@ class AnnDataCollator(Collator):
|
|
|
219
241
|
"""
|
|
220
242
|
super().__init__(*args, **kwargs)
|
|
221
243
|
|
|
222
|
-
def __call__(self, batch):
|
|
244
|
+
def __call__(self, batch) -> dict[str, Tensor]:
|
|
223
245
|
exprs = []
|
|
224
246
|
total_count = []
|
|
225
247
|
other_classes = []
|
|
@@ -272,6 +294,9 @@ class AnnDataCollator(Collator):
         }
 
 
+#############
+#### WIP ####
+#############
 class GeneformerCollator(Collator):
     def __init__(self, *args, gene_norm_list: list, **kwargs):
         """