scdataloader 1.1.3__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,133 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # templates
+ .github/templates/*
+ .DS_Store
@@ -0,0 +1,299 @@
+ Metadata-Version: 2.3
+ Name: scdataloader
+ Version: 1.2.2
+ Summary: a dataloader for single cell data in lamindb
+ Project-URL: repository, https://github.com/jkobject/scDataLoader
+ Author-email: jkobject <jkobject@gmail.com>
+ License: MIT
+ Keywords: dataloader,lamindb,pytorch,scPRINT,scRNAseq
+ Requires-Python: <3.11,>=3.10
+ Requires-Dist: anndata>=0.9.0
+ Requires-Dist: biomart>=0.9.0
+ Requires-Dist: cellxgene-census>=0.1.0
+ Requires-Dist: django>=4.0.0
+ Requires-Dist: ipykernel>=6.20.0
+ Requires-Dist: lamindb[bionty]==0.76.12
+ Requires-Dist: leidenalg>=0.8.0
+ Requires-Dist: lightning>=2.0.0
+ Requires-Dist: matplotlib>=3.5.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: palantir>=1.3.3
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: scikit-misc>=0.5.0
+ Requires-Dist: seaborn>=0.11.0
+ Requires-Dist: torch==2.2.0
+ Requires-Dist: torchdata>=0.5.0
+ Provides-Extra: dev
+ Requires-Dist: coverage>=7.3.2; extra == 'dev'
+ Requires-Dist: gitchangelog>=3.0.4; extra == 'dev'
+ Requires-Dist: mkdocs-git-authors-plugin>=0.4.0; extra == 'dev'
+ Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.0.0; extra == 'dev'
+ Requires-Dist: mkdocs-jupyter>=0.2.0; extra == 'dev'
+ Requires-Dist: mkdocs>=1.5.3; extra == 'dev'
+ Requires-Dist: mkdocstrings-python>=0.10.0; extra == 'dev'
+ Requires-Dist: mkdocstrings>=0.22.0; extra == 'dev'
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
+ Requires-Dist: pytest>=7.4.3; extra == 'dev'
+ Requires-Dist: ruff>=0.6.4; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # scdataloader
+
+ [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
+ [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
+ [![GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)](https://img.shields.io/github/issues/jkobject/scDataLoader)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)
+
+ This single cell pytorch dataloader / lightning datamodule is designed to be used with:
+
+ - [lamindb](https://lamin.ai/)
+
+ and:
+
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+
+ It allows you to:
+
+ 1. load thousands of datasets containing millions of cells in a few seconds.
+ 2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
+ 3. create a more complex single cell dataset
+ 4. extend it to your needs
+
+ built on top of `lamindb` and the `.mapped()` function by [Sergey](https://github.com/Koncopd).
+
+ The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
+
+ ## More
+
+ I built this dataloader for my PhD project, where I use it to load and preprocess thousands of datasets containing millions of cells in a few seconds. I believe that anyone applying AI to single-cell RNA-seq and other sequencing datasets will want such a tool, which did not exist before.
+
+ ![scdataloader.drawio.png](docs/scdataloader.drawio.png)
+
+ ## Install it from PyPI
+
+ ```bash
+ pip install scdataloader
+ # or, with the dev dependencies (quote the extras so your shell doesn't expand the brackets)
+ pip install "scdataloader[dev]"
+
+ lamin init --storage ./testdb --name test --schema bionty
+ ```
+
+ If you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT uses ontologies to define its cell types, diseases, sexes, ethnicities, etc.
+
+ You can do it manually or with our function:
+
+ ```python
+ from scdataloader.utils import populate_my_ontology
+
+ # populate everything (recommended; can take 2-10 minutes)
+ populate_my_ontology()
+
+ # or pass only the minimum the tool needs
+ populate_my_ontology(
+     organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
+     sex=["PATO:0000384", "PATO:0000383"],
+     celltypes=None,
+     ethnicities=None,
+     assays=None,
+     tissues=None,
+     diseases=None,
+     dev_stages=None,
+ )
+ ```
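+
+ To sanity-check the population step, you can count entries in the bionty registries (a rough check; the exact registry names are in the bionty docs):
+
+ ```python
+ import bionty as bt
+
+ # both counts should be non-zero once populate_my_ontology() has run
+ print(bt.CellType.filter().count())
+ print(bt.Disease.filter().count())
+ ```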
+
+ ### Dev install
+
+ If you want to use the latest version of scDataLoader and work on the code yourself, use `git clone` and `pip install -e` instead of `pip install`.
+
+ ```bash
+ git clone https://github.com/jkobject/scDataLoader.git
+ pip install -e "scDataLoader[dev]"
+ ```
+
+ ## Usage
+
+ ### DataModule usage
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln
+
+ from scdataloader import utils, Preprocessor, DataModule
+
+ # preprocess a dataset (adata is an AnnData you have already loaded)
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ # register it in lamindb as a one-artifact collection
+ art = ln.Artifact(adata, description="test")
+ art.save()
+ ln.Collection(art, name="test", description="test").save()
+
+ datamodule = DataModule(
+     collection_name="test",
+     organisms=["NCBITaxon:9606"],  # organism(s) to work on
+     how="most expr",  # the collator keeps only the most expressed genes
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+ )
+ ```
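+
+ Outside of a lightning `Trainer` you can consume the datamodule directly (depending on the version you may need to call `setup()` yourself, as `Trainer` normally does):
+
+ ```python
+ datamodule.setup()
+ for batch in datamodule.train_dataloader():
+     print(batch)
+     break
+ ```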
+
+ ### lightning-free usage (Dataset+Collator+DataLoader)
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ from tqdm import tqdm
+
+ from scdataloader import utils, Preprocessor, SimpleAnnDataset, Collator, DataLoader
+
+ # preprocess the dataset
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ # create the dataset
+ adataset = SimpleAnnDataset(
+     adata, obs_to_output=["organism_ontology_term_id"]
+ )
+ # create the collator
+ col = Collator(
+     organisms="NCBITaxon:9606",
+     valid_genes=adata.var_names,
+     max_len=2000,  # maximum number of genes to use
+     how="most expr",  # one of "some", "most expr", "random_expr"
+     # genelist=[geneA, geneB] if how == "some"
+ )
+ # create the dataloader
+ dataloader = DataLoader(
+     adataset,
+     collate_fn=col,
+     batch_size=64,
+     num_workers=4,
+     shuffle=False,
+ )
+
+ # predict with your model of choice (model is not provided by scdataloader)
+ for batch in tqdm(dataloader):
+     gene_pos, expression, depth = (
+         batch["genes"],
+         batch["x"],
+         batch["depth"],
+     )
+     model.predict(
+         gene_pos,
+         expression,
+         depth,
+     )
+ ```
+
+ ### Usage on all of cellxgene
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln
+
+ from scdataloader import utils
+ from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+ # preprocess datasets
+ DESCRIPTION = 'preprocessed by scDataLoader'
+
+ cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+ cx_dataset, len(cx_dataset.artifacts.all())
+
+ do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+ preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+     collection_name="preprocessed dataset",
+     organisms=["NCBITaxon:9606"],  # organism(s) to work on
+     how="most expr",  # the collator keeps only the most expressed genes
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+     test_split=0,
+ )
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+     print(i)
+     break
+
+ # with lightning:
+ # Trainer(model, datamodule)
+ ```
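+
+ The last comment above is shorthand; spelled out, the lightning route looks roughly like this (`MyModel` stands in for any `LightningModule` you define yourself, it is not part of scDataLoader):
+
+ ```python
+ import lightning as L
+
+ trainer = L.Trainer(max_epochs=1, accelerator="auto")
+ trainer.fit(MyModel(), datamodule=datamodule)
+ ```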
+
+ See the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+ 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+ 2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+ ### command line preprocessing
+
+ You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This allows parallelization and easier usage.
+
+ ```bash
+ scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+ ```
+
+ ### command line usage
+
+ The main way to use scDataLoader from the command line is through lightning's CLI, for instance as part of scPRINT:
+
+ > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
+
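+ As a minimal sketch of that pattern (assuming `MyModel` is a `LightningModule` you define yourself; it is not part of scDataLoader):
+
+ ```python
+ # train.py -- enables e.g. `python train.py fit --data.collection_name=test`
+ from lightning.pytorch.cli import LightningCLI
+
+ from scdataloader import DataModule
+ from mymodels import MyModel  # hypothetical LightningModule of your own
+
+ if __name__ == "__main__":
+     LightningCLI(MyModel, DataModule)
+ ```
+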
+ ## FAQ
+
+ ### how to update my ontologies?
+
+ ```python
+ import bionty as bt
+ bt.reset_sources()
+
+ # then, from the CLI, reload your instance: lamin load <your instance>
+
+ import lnschema_bionty as lb
+ lb.dev.sync_bionty_source_to_latest()
+ ```
+
+ ### how to load all ontologies?
+
+ ```python
+ from scdataloader import utils
+ utils.populate_my_ontology()  # this might take 5-20 minutes
+ ```
+
+ ## Development
+
+ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - [lamin.ai](https://lamin.ai/)
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+ - [scprint](https://www.jkobject.com/scPRINT/)
+
+ Awesome single cell dataloader created by @jkobject
@@ -3,7 +3,6 @@
  [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
  [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
  [![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
- [![Documentation Status](https://readthedocs.org/projects/scDataLoader/badge/?version=latest)](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
  [![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
  [![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
  [![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
@@ -44,8 +43,7 @@ pip install scdataloader
  # or
  pip install scDataLoader[dev] # for dev dependencies

- lamin login <email> --key <API-key>
- lamin init --storage [folder-name-where-lamin-data-will-be-stored] --schema bionty
+ lamin init --storage ./testdb --name test --schema bionty
  ```

  if you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT is using ontologies to define its cell types, diseases, sexes, ethnicities, etc.
@@ -57,7 +55,7 @@ from scdataloader.utils import populate_my_ontology

  populate_my_ontology() #to populate everything (recommended) (can take 2-10mns)

- populate_my_ontology( #the minimum for scprint to run some inferences (denoising, grn inference)
+ populate_my_ontology( #the minimum the tool needs
  organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
  sex: List[str] = ["PATO:0000384", "PATO:0000383"],
  celltypes = None,
@@ -80,11 +78,91 @@ pip install -e scDataLoader[dev]

  ## Usage

- ### Direct Usage
+ ### DataModule usage

  ```python
  # initialize a local lamin database
- # !lamin init --storage ~/scdataloader --schema bionty
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln
+
+ from scdataloader import utils, Preprocessor, DataModule
+
+ # preprocess a dataset (adata is an AnnData you have already loaded)
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ # register it in lamindb as a one-artifact collection
+ art = ln.Artifact(adata, description="test")
+ art.save()
+ ln.Collection(art, name="test", description="test").save()
+
+ datamodule = DataModule(
+     collection_name="test",
+     organisms=["NCBITaxon:9606"],  # organism(s) to work on
+     how="most expr",  # the collator keeps only the most expressed genes
+     max_len=1000,  # only the 1000 most expressed
+     batch_size=64,
+     num_workers=1,
+     validation_split=0.1,
+ )
+ ```
+
+ ### lightning-free usage (Dataset+Collator+DataLoader)
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ from tqdm import tqdm
+
+ from scdataloader import utils, Preprocessor, SimpleAnnDataset, Collator, DataLoader
+
+ # preprocess the dataset
+ preprocessor = Preprocessor(
+     do_postp=False,
+     force_preprocess=True,
+ )
+ adata = preprocessor(adata)
+
+ # create the dataset
+ adataset = SimpleAnnDataset(
+     adata, obs_to_output=["organism_ontology_term_id"]
+ )
+ # create the collator
+ col = Collator(
+     organisms="NCBITaxon:9606",
+     valid_genes=adata.var_names,
+     max_len=2000,  # maximum number of genes to use
+     how="most expr",  # one of "some", "most expr", "random_expr"
+     # genelist=[geneA, geneB] if how == "some"
+ )
+ # create the dataloader
+ dataloader = DataLoader(
+     adataset,
+     collate_fn=col,
+     batch_size=64,
+     num_workers=4,
+     shuffle=False,
+ )
+
+ # predict with your model of choice (model is not provided by scdataloader)
+ for batch in tqdm(dataloader):
+     gene_pos, expression, depth = (
+         batch["genes"],
+         batch["x"],
+         batch["depth"],
+     )
+     model.predict(
+         gene_pos,
+         expression,
+         depth,
+     )
+ ```
+
+ ### Usage on all of cellxgene
+
+ ```python
+ # initialize a local lamin database
+ #! lamin init --storage ./cellxgene --name cellxgene --schema bionty
+ import lamindb as ln

  from scdataloader import utils
  from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
@@ -0,0 +1,78 @@
+ [project]
+ name = "scdataloader"
+ version = "1.2.2"
+ description = "a dataloader for single cell data in lamindb"
+ authors = [
+     {name = "jkobject", email = "jkobject@gmail.com"}
+ ]
+ license = "MIT"
+ readme = "README.md"
+ requires-python = ">=3.10,<3.11"
+ keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
+ dependencies = [
+     "numpy>=1.26.0",
+     "lamindb[bionty]==0.76.12",
+     "cellxgene-census>=0.1.0",
+     "torch==2.2.0",
+     "lightning>=2.0.0",
+     "anndata>=0.9.0",
+     "matplotlib>=3.5.0",
+     "seaborn>=0.11.0",
+     "ipykernel>=6.20.0",
+     "torchdata>=0.5.0",
+     "biomart>=0.9.0",
+     "pandas>=2.0.0",
+     "leidenalg>=0.8.0",
+     "django>=4.0.0",
+     "scikit-misc>=0.5.0",
+     "palantir>=1.3.3",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=7.4.3",
+     "coverage>=7.3.2",
+     "pytest-cov>=4.1.0",
+     "ruff>=0.6.4",
+     "gitchangelog>=3.0.4",
+     "mkdocs>=1.5.3",
+     "mkdocs-git-revision-date-localized-plugin>=1.0.0",
+     "mkdocstrings>=0.22.0",
+     "mkdocs-git-authors-plugin>=0.4.0",
+     "mkdocs-jupyter>=0.2.0",
+     "mkdocstrings-python>=0.10.0"
+ ]
+
+ [tool.ruff]
+ # Set the maximum line length to 88.
+ line-length = 88
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I"]
+ ignore = ["E501", "E203", "E266", "E265", "F401", "F403", "E722", "E741", "E731", "E721"]
+
+ [project.urls]
+ repository = "https://github.com/jkobject/scDataLoader"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.sdist]
+ only-include = [
+     "/scdataloader",
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ only-include = [
+     "/scdataloader",
+ ]
+
+ [tool.hatch.metadata]
+ allow-direct-references = true
+
+ [tool.uv.sources]
+ scdataloader = { workspace = true }
+
+ [project.scripts]
+ scdataloader = 'scdataloader.__main__:main'
@@ -0,0 +1 @@
+ 1.2.2
@@ -1,11 +1,13 @@
  import argparse
+ from typing import Optional, Union
+
+ import lamindb as ln
+
  from scdataloader.preprocess import (
      LaminPreprocessor,
-     additional_preprocess,
      additional_postprocess,
+     additional_preprocess,
  )
- import lamindb as ln
- from typing import Optional, Union


  # scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
@@ -51,14 +53,14 @@ def main():
  )
  parser.add_argument(
      "--filter_gene_by_counts",
-     type=Union[int, bool],
-     default=False,
+     type=int,
+     default=0,
      help="Determines whether to filter genes by counts.",
  )
  parser.add_argument(
      "--filter_cell_by_counts",
-     type=Union[int, bool],
-     default=False,
+     type=int,
+     default=0,
      help="Determines whether to filter cells by counts.",
  )
  parser.add_argument(
@@ -151,6 +153,12 @@ def main():
      default=False,
      help="Determines whether to do postprocessing.",
  )
+ parser.add_argument(
+     "--cache",
+     type=bool,
+     default=True,
+     help="Determines whether to cache the dataset.",
+ )
  args = parser.parse_args()

  # Load the collection
@@ -176,6 +184,7 @@ def main():
      normalize_sum=args.normalize_sum,
      subset_hvg=args.subset_hvg,
      hvg_flavor=args.hvg_flavor,
+     cache=args.cache,
      binning=args.binning,
      result_binned_key=args.result_binned_key,
      length_normalize=args.length_normalize,
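
A note on the argparse changes above: `typing.Union` is not callable, so the old `type=Union[int, bool]` made argparse reject every value actually passed on the command line; plain `type=int` fixes that. The new `--cache` flag still carries a classic pitfall: argparse applies `bool()` to the raw string, and every non-empty string (including `"False"`) is truthy. A standalone sketch, not scdataloader code:

```python
import argparse
from typing import Union

p = argparse.ArgumentParser()
p.add_argument("--filter_gene_by_counts", type=int, default=0)  # the new style: works
p.add_argument("--cache", type=bool, default=True)              # pitfall: see below

args = p.parse_args(["--filter_gene_by_counts", "3", "--cache", "False"])
print(args.filter_gene_by_counts)  # 3
print(args.cache)                  # True -- bool("False") is truthy

# the old style fails whenever a value is supplied, because calling
# Union[int, bool] on a string raises and argparse reports an invalid value:
# p.add_argument("--filter_gene_by_counts", type=Union[int, bool])
```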
@@ -1,7 +1,9 @@
+ from typing import Optional
+
  import numpy as np
- from .utils import load_genes, downsample_profile
  from torch import Tensor, long
- from typing import Optional
+
+ from .utils import downsample_profile, load_genes


  class Collator: