scdataloader 0.0.3__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scdataloader-0.0.3 → scdataloader-1.0.1}/PKG-INFO +87 -18
- scdataloader-1.0.1/README.md +133 -0
- {scdataloader-0.0.3 → scdataloader-1.0.1}/pyproject.toml +1 -1
- scdataloader-1.0.1/scdataloader/VERSION +1 -0
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/__init__.py +1 -1
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/__main__.py +66 -42
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/collator.py +136 -67
- scdataloader-1.0.1/scdataloader/config.py +112 -0
- scdataloader-1.0.1/scdataloader/data.py +327 -0
- scdataloader-1.0.1/scdataloader/datamodule.py +403 -0
- scdataloader-1.0.1/scdataloader/mapped.py +519 -0
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/preprocess.py +240 -109
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/utils.py +162 -70
- scdataloader-0.0.3/README.md +0 -63
- scdataloader-0.0.3/scdataloader/VERSION +0 -1
- scdataloader-0.0.3/scdataloader/data.py +0 -336
- scdataloader-0.0.3/scdataloader/dataloader.py +0 -318
- scdataloader-0.0.3/scdataloader/mapped.py +0 -343
- {scdataloader-0.0.3 → scdataloader-1.0.1}/LICENSE +0 -0
- {scdataloader-0.0.3 → scdataloader-1.0.1}/scdataloader/base.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Home-page: https://github.com/jkobject/scDataLoader
|
|
6
6
|
License: GPL3
|
|
@@ -34,12 +34,16 @@ Description-Content-Type: text/markdown
|
|
|
34
34
|
|
|
35
35
|
[](https://codecov.io/gh/jkobject/scDataLoader)
|
|
36
36
|
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
|
|
37
|
+
[](https://badge.fury.io/py/scDataLoader)
|
|
38
|
+
[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
|
|
39
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
40
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
41
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
42
|
+
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
|
|
43
|
+
[](https://github.com/psf/black)
|
|
44
|
+
[](https://doi.org/10.1101/2024.07.29.605556)
|
|
37
45
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
41
|
-
|
|
42
|
-
This data loader is designed to be used with:
|
|
46
|
+
This single cell pytorch dataloader / lightning datamodule is designed to be used with:
|
|
43
47
|
|
|
44
48
|
- [lamindb](https://lamin.ai/)
|
|
45
49
|
|
|
@@ -55,18 +59,13 @@ It allows you to:
|
|
|
55
59
|
3. create a more complex single cell dataset
|
|
56
60
|
4. extend it to your needs
|
|
57
61
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
|
|
62
|
+
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
61
63
|
|
|
62
|
-
|
|
63
|
-
2. doing some dataset specific preprocessing if needed
|
|
64
|
-
3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
|
|
65
|
-
4. passing it to a dataloader object that can work with it correctly
|
|
64
|
+
## More
|
|
66
65
|
|
|
67
|
-
|
|
66
|
+
I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that people applying AI to single-cell RNA sequencing and other sequencing datasets would want such a tool, which did not exist at the time.
|
|
68
67
|
|
|
69
|
-

|
|
68
|
+

|
|
70
69
|
|
|
71
70
|
## Install it from PyPI
|
|
72
71
|
|
|
@@ -85,15 +84,85 @@ then run the notebooks with the poetry installed environment
|
|
|
85
84
|
|
|
86
85
|
## Usage
|
|
87
86
|
|
|
88
|
-
|
|
87
|
+
### Direct Usage
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
# initialize a local lamin database
|
|
91
|
+
# !lamin init --storage ~/scdataloader --schema bionty
|
|
92
|
+
|
|
93
|
+
from scdataloader import utils
|
|
94
|
+
from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
|
|
95
|
+
|
|
96
|
+
# preprocess datasets
|
|
97
|
+
DESCRIPTION='preprocessed by scDataLoader'
|
|
98
|
+
|
|
99
|
+
cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
|
|
100
|
+
cx_dataset, len(cx_dataset.artifacts.all())
|
|
89
101
|
|
|
90
|
-
|
|
91
|
-
|
|
102
|
+
|
|
103
|
+
do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
|
|
104
|
+
|
|
105
|
+
preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
|
|
106
|
+
|
|
107
|
+
# create dataloaders
|
|
108
|
+
from scdataloader import DataModule
|
|
109
|
+
import tqdm
|
|
110
|
+
|
|
111
|
+
datamodule = DataModule(
|
|
112
|
+
collection_name="preprocessed dataset",
|
|
113
|
+
organisms=["NCBITaxon:9606"], #organism that we will work on
|
|
114
|
+
how="most expr", # for the collator (most expr genes only will be selected)
|
|
115
|
+
max_len=1000, # only the 1000 most expressed
|
|
116
|
+
batch_size=64,
|
|
117
|
+
num_workers=1,
|
|
118
|
+
validation_split=0.1,
|
|
119
|
+
test_split=0)
|
|
120
|
+
|
|
121
|
+
for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
122
|
+
# pass #or do pass
|
|
123
|
+
print(i)
|
|
124
|
+
break
|
|
125
|
+
|
|
126
|
+
# with lightning:
|
|
127
|
+
# Trainer(model, datamodule)
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
|
|
132
|
+
|
|
133
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
134
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
135
|
+
|
|
136
|
+
### command line preprocessing
|
|
137
|
+
|
|
138
|
+
You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### command line usage
|
|
145
|
+
|
|
146
|
+
The main way to use scDataLoader from the command line is described in the documentation linked below.
|
|
147
|
+
|
|
148
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
|
|
92
149
|
|
|
93
150
|
## Development
|
|
94
151
|
|
|
95
152
|
Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
|
96
153
|
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
This project is licensed under the GNU General Public License v3.0 (GPL-3.0) - see the [LICENSE](LICENSE) file for details.
|
|
157
|
+
|
|
158
|
+
## Acknowledgments
|
|
159
|
+
|
|
160
|
+
- [lamin.ai](https://lamin.ai/)
|
|
161
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
162
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
163
|
+
- [scprint](https://www.jkobject.com/scPRINT/)
|
|
164
|
+
|
|
165
|
+
Awesome single cell dataloader created by @jkobject
|
|
97
166
|
GNU GENERAL PUBLIC LICENSE
|
|
98
167
|
Version 3, 29 June 2007
|
|
99
168
|
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# scdataloader
|
|
2
|
+
|
|
3
|
+
[](https://codecov.io/gh/jkobject/scDataLoader)
|
|
4
|
+
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
|
|
5
|
+
[](https://badge.fury.io/py/scDataLoader)
|
|
6
|
+
[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
|
|
7
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
8
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
9
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
10
|
+
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
|
|
11
|
+
[](https://github.com/psf/black)
|
|
12
|
+
[](https://doi.org/10.1101/2024.07.29.605556)
|
|
13
|
+
|
|
14
|
+
This single cell pytorch dataloader / lightning datamodule is designed to be used with:
|
|
15
|
+
|
|
16
|
+
- [lamindb](https://lamin.ai/)
|
|
17
|
+
|
|
18
|
+
and:
|
|
19
|
+
|
|
20
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
21
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
22
|
+
|
|
23
|
+
It allows you to:
|
|
24
|
+
|
|
25
|
+
1. load thousands of datasets containing millions of cells in a few seconds.
|
|
26
|
+
2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
|
|
27
|
+
3. create a more complex single cell dataset
|
|
28
|
+
4. extend it to your needs
|
|
29
|
+
|
|
30
|
+
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
31
|
+
|
|
32
|
+
## More
|
|
33
|
+
|
|
34
|
+
I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that people applying AI to single-cell RNA sequencing and other sequencing datasets would want such a tool, which did not exist at the time.
|
|
35
|
+
|
|
36
|
+

|
|
37
|
+
|
|
38
|
+
## Install it from PyPI
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install scdataloader
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Install it locally and run the notebooks:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
git clone https://github.com/jkobject/scDataLoader.git
|
|
48
|
+
cd scDataLoader
|
|
49
|
+
poetry install
|
|
50
|
+
```
|
|
51
|
+
then run the notebooks with the poetry installed environment
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Direct Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# initialize a local lamin database
|
|
59
|
+
# !lamin init --storage ~/scdataloader --schema bionty
|
|
60
|
+
|
|
61
|
+
from scdataloader import utils
|
|
62
|
+
from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
|
|
63
|
+
|
|
64
|
+
# preprocess datasets
|
|
65
|
+
DESCRIPTION='preprocessed by scDataLoader'
|
|
66
|
+
|
|
67
|
+
cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
|
|
68
|
+
cx_dataset, len(cx_dataset.artifacts.all())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
|
|
72
|
+
|
|
73
|
+
preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
|
|
74
|
+
|
|
75
|
+
# create dataloaders
|
|
76
|
+
from scdataloader import DataModule
|
|
77
|
+
import tqdm
|
|
78
|
+
|
|
79
|
+
datamodule = DataModule(
|
|
80
|
+
collection_name="preprocessed dataset",
|
|
81
|
+
organisms=["NCBITaxon:9606"], #organism that we will work on
|
|
82
|
+
how="most expr", # for the collator (most expr genes only will be selected)
|
|
83
|
+
max_len=1000, # only the 1000 most expressed
|
|
84
|
+
batch_size=64,
|
|
85
|
+
num_workers=1,
|
|
86
|
+
validation_split=0.1,
|
|
87
|
+
test_split=0)
|
|
88
|
+
|
|
89
|
+
for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
90
|
+
# pass #or do pass
|
|
91
|
+
print(i)
|
|
92
|
+
break
|
|
93
|
+
|
|
94
|
+
# with lightning:
|
|
95
|
+
# Trainer(model, datamodule)
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
|
|
100
|
+
|
|
101
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
102
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
103
|
+
|
|
104
|
+
### command line preprocessing
|
|
105
|
+
|
|
106
|
+
You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### command line usage
|
|
113
|
+
|
|
114
|
+
The main way to use scDataLoader from the command line is described in the documentation linked below.
|
|
115
|
+
|
|
116
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
|
|
117
|
+
|
|
118
|
+
## Development
|
|
119
|
+
|
|
120
|
+
Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
This project is licensed under the GNU General Public License v3.0 (GPL-3.0) - see the [LICENSE](LICENSE) file for details.
|
|
125
|
+
|
|
126
|
+
## Acknowledgments
|
|
127
|
+
|
|
128
|
+
- [lamin.ai](https://lamin.ai/)
|
|
129
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
130
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
131
|
+
- [scprint](https://www.jkobject.com/scPRINT/)
|
|
132
|
+
|
|
133
|
+
Awesome single cell dataloader created by @jkobject
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1.0.0
|
|
@@ -1,9 +1,18 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
from scdataloader.preprocess import
|
|
2
|
+
from scdataloader.preprocess import (
|
|
3
|
+
LaminPreprocessor,
|
|
4
|
+
additional_preprocess,
|
|
5
|
+
additional_postprocess,
|
|
6
|
+
)
|
|
3
7
|
import lamindb as ln
|
|
4
8
|
from typing import Optional, Union
|
|
5
9
|
|
|
10
|
+
|
|
11
|
+
# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
|
|
6
12
|
def main():
|
|
13
|
+
"""
|
|
14
|
+
main function to preprocess datasets in a given lamindb collection.
|
|
15
|
+
"""
|
|
7
16
|
parser = argparse.ArgumentParser(
|
|
8
17
|
description="Preprocess datasets in a given lamindb collection."
|
|
9
18
|
)
|
|
@@ -11,22 +20,31 @@ def main():
|
|
|
11
20
|
"--name", type=str, required=True, help="Name of the input dataset"
|
|
12
21
|
)
|
|
13
22
|
parser.add_argument(
|
|
14
|
-
"--new_name",
|
|
23
|
+
"--new_name",
|
|
24
|
+
type=str,
|
|
25
|
+
default="preprocessed dataset",
|
|
26
|
+
help="Name of the preprocessed dataset.",
|
|
15
27
|
)
|
|
16
28
|
parser.add_argument(
|
|
17
29
|
"--description",
|
|
18
30
|
type=str,
|
|
19
|
-
default="preprocessed by scDataLoader"
|
|
31
|
+
default="preprocessed by scDataLoader",
|
|
20
32
|
help="Description of the preprocessed dataset.",
|
|
21
33
|
)
|
|
22
34
|
parser.add_argument(
|
|
23
35
|
"--start_at", type=int, default=0, help="Position to start preprocessing at."
|
|
24
36
|
)
|
|
25
37
|
parser.add_argument(
|
|
26
|
-
"--new_version",
|
|
38
|
+
"--new_version",
|
|
39
|
+
type=str,
|
|
40
|
+
default="2",
|
|
41
|
+
help="Version of the output dataset and files.",
|
|
27
42
|
)
|
|
28
43
|
parser.add_argument(
|
|
29
|
-
"--instance",
|
|
44
|
+
"--instance",
|
|
45
|
+
type=str,
|
|
46
|
+
default=None,
|
|
47
|
+
help="Instance storing the input dataset, if not local",
|
|
30
48
|
)
|
|
31
49
|
parser.add_argument(
|
|
32
50
|
"--version", type=str, default=None, help="Version of the input dataset."
|
|
@@ -35,125 +53,127 @@ def main():
|
|
|
35
53
|
"--filter_gene_by_counts",
|
|
36
54
|
type=Union[int, bool],
|
|
37
55
|
default=False,
|
|
38
|
-
help="Determines whether to filter genes by counts."
|
|
56
|
+
help="Determines whether to filter genes by counts.",
|
|
39
57
|
)
|
|
40
58
|
parser.add_argument(
|
|
41
59
|
"--filter_cell_by_counts",
|
|
42
60
|
type=Union[int, bool],
|
|
43
61
|
default=False,
|
|
44
|
-
help="Determines whether to filter cells by counts."
|
|
62
|
+
help="Determines whether to filter cells by counts.",
|
|
45
63
|
)
|
|
46
64
|
parser.add_argument(
|
|
47
65
|
"--normalize_sum",
|
|
48
66
|
type=float,
|
|
49
67
|
default=1e4,
|
|
50
|
-
help="Determines whether to normalize the total counts of each cell to a specific value."
|
|
51
|
-
)
|
|
52
|
-
parser.add_argument(
|
|
53
|
-
"--keep_norm_layer",
|
|
54
|
-
type=bool,
|
|
55
|
-
default=False,
|
|
56
|
-
help="Determines whether to keep the normalization layer."
|
|
68
|
+
help="Determines whether to normalize the total counts of each cell to a specific value.",
|
|
57
69
|
)
|
|
58
70
|
parser.add_argument(
|
|
59
71
|
"--subset_hvg",
|
|
60
72
|
type=int,
|
|
61
73
|
default=0,
|
|
62
|
-
help="Determines whether to subset highly variable genes."
|
|
74
|
+
help="Determines whether to subset highly variable genes.",
|
|
63
75
|
)
|
|
64
76
|
parser.add_argument(
|
|
65
77
|
"--hvg_flavor",
|
|
66
78
|
type=str,
|
|
67
79
|
default="seurat_v3",
|
|
68
|
-
help="Specifies the flavor of highly variable genes selection."
|
|
80
|
+
help="Specifies the flavor of highly variable genes selection.",
|
|
69
81
|
)
|
|
70
82
|
parser.add_argument(
|
|
71
83
|
"--binning",
|
|
72
84
|
type=Optional[int],
|
|
73
85
|
default=None,
|
|
74
|
-
help="Determines whether to bin the data into discrete values of number of bins provided."
|
|
86
|
+
help="Determines whether to bin the data into discrete values of number of bins provided.",
|
|
75
87
|
)
|
|
76
88
|
parser.add_argument(
|
|
77
89
|
"--result_binned_key",
|
|
78
90
|
type=str,
|
|
79
91
|
default="X_binned",
|
|
80
|
-
help="Specifies the key of AnnData to store the binned data."
|
|
92
|
+
help="Specifies the key of AnnData to store the binned data.",
|
|
81
93
|
)
|
|
82
94
|
parser.add_argument(
|
|
83
95
|
"--length_normalize",
|
|
84
96
|
type=bool,
|
|
85
97
|
default=False,
|
|
86
|
-
help="Determines whether to normalize the length."
|
|
98
|
+
help="Determines whether to normalize the length.",
|
|
87
99
|
)
|
|
88
100
|
parser.add_argument(
|
|
89
101
|
"--force_preprocess",
|
|
90
102
|
type=bool,
|
|
91
103
|
default=False,
|
|
92
|
-
help="Determines whether to force preprocessing."
|
|
104
|
+
help="Determines whether to force preprocessing.",
|
|
93
105
|
)
|
|
94
106
|
parser.add_argument(
|
|
95
107
|
"--min_dataset_size",
|
|
96
108
|
type=int,
|
|
97
109
|
default=100,
|
|
98
|
-
help="Specifies the minimum dataset size."
|
|
110
|
+
help="Specifies the minimum dataset size.",
|
|
99
111
|
)
|
|
100
112
|
parser.add_argument(
|
|
101
113
|
"--min_valid_genes_id",
|
|
102
114
|
type=int,
|
|
103
115
|
default=10_000,
|
|
104
|
-
help="Specifies the minimum valid genes id."
|
|
116
|
+
help="Specifies the minimum valid genes id.",
|
|
105
117
|
)
|
|
106
118
|
parser.add_argument(
|
|
107
119
|
"--min_nnz_genes",
|
|
108
120
|
type=int,
|
|
109
|
-
default=
|
|
110
|
-
help="Specifies the minimum non-zero genes."
|
|
121
|
+
default=400,
|
|
122
|
+
help="Specifies the minimum non-zero genes.",
|
|
111
123
|
)
|
|
112
124
|
parser.add_argument(
|
|
113
125
|
"--maxdropamount",
|
|
114
126
|
type=int,
|
|
115
|
-
default=
|
|
116
|
-
help="Specifies the maximum drop amount."
|
|
127
|
+
default=50,
|
|
128
|
+
help="Specifies the maximum drop amount.",
|
|
117
129
|
)
|
|
118
130
|
parser.add_argument(
|
|
119
|
-
"--madoutlier",
|
|
120
|
-
type=int,
|
|
121
|
-
default=5,
|
|
122
|
-
help="Specifies the MAD outlier."
|
|
131
|
+
"--madoutlier", type=int, default=5, help="Specifies the MAD outlier."
|
|
123
132
|
)
|
|
124
133
|
parser.add_argument(
|
|
125
134
|
"--pct_mt_outlier",
|
|
126
135
|
type=int,
|
|
127
136
|
default=8,
|
|
128
|
-
help="Specifies the percentage of MT outlier."
|
|
137
|
+
help="Specifies the percentage of MT outlier.",
|
|
129
138
|
)
|
|
130
139
|
parser.add_argument(
|
|
131
|
-
"--batch_key",
|
|
132
|
-
type=Optional[str],
|
|
133
|
-
default=None,
|
|
134
|
-
help="Specifies the batch key."
|
|
140
|
+
"--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
|
|
135
141
|
)
|
|
136
142
|
parser.add_argument(
|
|
137
143
|
"--skip_validate",
|
|
138
144
|
type=bool,
|
|
139
145
|
default=False,
|
|
140
|
-
help="Determines whether to skip validation."
|
|
146
|
+
help="Determines whether to skip validation.",
|
|
147
|
+
)
|
|
148
|
+
parser.add_argument(
|
|
149
|
+
"--do_postp",
|
|
150
|
+
type=bool,
|
|
151
|
+
default=False,
|
|
152
|
+
help="Determines whether to do postprocessing.",
|
|
141
153
|
)
|
|
142
154
|
args = parser.parse_args()
|
|
143
155
|
|
|
144
156
|
# Load the collection
|
|
157
|
+
# if not args.preprocess:
|
|
158
|
+
# print("Only preprocess is available for now")
|
|
159
|
+
# return
|
|
145
160
|
if args.instance is not None:
|
|
146
|
-
collection =
|
|
147
|
-
|
|
148
|
-
|
|
161
|
+
collection = (
|
|
162
|
+
ln.Collection.using(instance=args.instance)
|
|
163
|
+
.filter(name=args.name, version=args.version)
|
|
164
|
+
.first()
|
|
165
|
+
)
|
|
166
|
+
else:
|
|
167
|
+
collection = ln.Collection.filter(name=args.name, version=args.version).first()
|
|
149
168
|
|
|
150
|
-
print(
|
|
169
|
+
print(
|
|
170
|
+
"using the dataset ", collection, " of size ", len(collection.artifacts.all())
|
|
171
|
+
)
|
|
151
172
|
# Initialize the preprocessor
|
|
152
173
|
preprocessor = LaminPreprocessor(
|
|
153
174
|
filter_gene_by_counts=args.filter_gene_by_counts,
|
|
154
175
|
filter_cell_by_counts=args.filter_cell_by_counts,
|
|
155
176
|
normalize_sum=args.normalize_sum,
|
|
156
|
-
keep_norm_layer=args.keep_norm_layer,
|
|
157
177
|
subset_hvg=args.subset_hvg,
|
|
158
178
|
hvg_flavor=args.hvg_flavor,
|
|
159
179
|
binning=args.binning,
|
|
@@ -168,10 +188,14 @@ def main():
|
|
|
168
188
|
pct_mt_outlier=args.pct_mt_outlier,
|
|
169
189
|
batch_key=args.batch_key,
|
|
170
190
|
skip_validate=args.skip_validate,
|
|
191
|
+
do_postp=args.do_postp,
|
|
192
|
+
additional_preprocess=additional_preprocess,
|
|
193
|
+
additional_postprocess=additional_postprocess,
|
|
194
|
+
keep_files=False,
|
|
171
195
|
)
|
|
172
196
|
|
|
173
197
|
# Preprocess the dataset
|
|
174
|
-
|
|
198
|
+
preprocessor(
|
|
175
199
|
collection,
|
|
176
200
|
name=args.new_name,
|
|
177
201
|
description=args.description,
|