scdataloader 1.9.1__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scdataloader-1.9.1 → scdataloader-2.0.0}/.gitignore +3 -0
- scdataloader-2.0.0/LICENSE +21 -0
- {scdataloader-1.9.1 → scdataloader-2.0.0}/PKG-INFO +13 -5
- {scdataloader-1.9.1 → scdataloader-2.0.0}/README.md +10 -1
- {scdataloader-1.9.1 → scdataloader-2.0.0}/pyproject.toml +4 -5
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/__init__.py +2 -1
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/collator.py +30 -42
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/config.py +25 -9
- scdataloader-2.0.0/scdataloader/data.json +384 -0
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/data.py +116 -43
- scdataloader-2.0.0/scdataloader/datamodule.py +829 -0
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/mapped.py +84 -18
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/preprocess.py +108 -94
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/utils.py +39 -33
- scdataloader-1.9.1/LICENSE +0 -674
- scdataloader-1.9.1/scdataloader/VERSION +0 -1
- scdataloader-1.9.1/scdataloader/datamodule.py +0 -499
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/__main__.py +0 -0
- {scdataloader-1.9.1 → scdataloader-2.0.0}/scdataloader/base.py +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Jérémie Kalfon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Project-URL: repository, https://github.com/jkobject/scDataLoader
|
|
6
6
|
Author-email: jkobject <jkobject@gmail.com>
|
|
@@ -12,14 +12,13 @@ Requires-Dist: anndata>=0.9.0
|
|
|
12
12
|
Requires-Dist: biomart>=0.9.0
|
|
13
13
|
Requires-Dist: cellxgene-census>=0.1.0
|
|
14
14
|
Requires-Dist: django>=4.0.0
|
|
15
|
-
Requires-Dist: harmonypy>=0.0.10
|
|
16
15
|
Requires-Dist: ipykernel>=6.20.0
|
|
17
16
|
Requires-Dist: jupytext>=1.16.0
|
|
18
|
-
Requires-Dist: lamindb[bionty,cellregistry,jupyter,
|
|
17
|
+
Requires-Dist: lamindb[bionty,cellregistry,jupyter,zarr]==1.0.4
|
|
19
18
|
Requires-Dist: leidenalg>=0.8.0
|
|
19
|
+
Requires-Dist: lightning>=2.3.0
|
|
20
20
|
Requires-Dist: matplotlib>=3.5.0
|
|
21
21
|
Requires-Dist: numpy==1.26.0
|
|
22
|
-
Requires-Dist: palantir>=1.3.3
|
|
23
22
|
Requires-Dist: pandas>=2.0.0
|
|
24
23
|
Requires-Dist: pytorch-lightning>=2.3.0
|
|
25
24
|
Requires-Dist: scikit-misc>=0.5.0
|
|
@@ -71,7 +70,16 @@ It allows you to:
|
|
|
71
70
|
3. create a more complex single cell dataset
|
|
72
71
|
4. extend it to your need
|
|
73
72
|
|
|
74
|
-
built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
|
|
73
|
+
built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
Portions of the mapped.py file are derived from Lamin Labs
|
|
77
|
+
Copyright 2024 Lamin Labs
|
|
78
|
+
Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
|
79
|
+
The rest of the package is licensed under MIT License, see LICENSE for details
|
|
80
|
+
Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
|
|
81
|
+
for the original implementation
|
|
82
|
+
```
|
|
75
83
|
|
|
76
84
|
The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
|
|
77
85
|
|
|
@@ -28,7 +28,16 @@ It allows you to:
|
|
|
28
28
|
3. create a more complex single cell dataset
|
|
29
29
|
4. extend it to your need
|
|
30
30
|
|
|
31
|
-
built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
|
|
31
|
+
built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
Portions of the mapped.py file are derived from Lamin Labs
|
|
35
|
+
Copyright 2024 Lamin Labs
|
|
36
|
+
Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
|
37
|
+
The rest of the package is licensed under MIT License, see LICENSE for details
|
|
38
|
+
Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
|
|
39
|
+
for the original implementation
|
|
40
|
+
```
|
|
32
41
|
|
|
33
42
|
The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
|
|
34
43
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "scdataloader"
|
|
3
|
-
version = "
|
|
3
|
+
version = "2.0.0"
|
|
4
4
|
description = "a dataloader for single cell data in lamindb"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "jkobject", email = "jkobject@gmail.com"}
|
|
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.14"
|
|
|
11
11
|
keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
|
|
12
12
|
dependencies = [
|
|
13
13
|
"numpy==1.26.0",
|
|
14
|
-
"lamindb[bionty,
|
|
14
|
+
"lamindb[bionty,jupyter,cellregistry,zarr]==1.0.4",
|
|
15
15
|
"cellxgene-census>=0.1.0",
|
|
16
16
|
"torch==2.2.0",
|
|
17
17
|
"pytorch-lightning>=2.3.0",
|
|
@@ -26,10 +26,9 @@ dependencies = [
|
|
|
26
26
|
"leidenalg>=0.8.0",
|
|
27
27
|
"django>=4.0.0",
|
|
28
28
|
"scikit-misc>=0.5.0",
|
|
29
|
-
"palantir>=1.3.3",
|
|
30
|
-
"harmonypy>=0.0.10",
|
|
31
29
|
"jupytext>=1.16.0",
|
|
32
|
-
|
|
30
|
+
"lightning>=2.3.0",
|
|
31
|
+
"pytorch-lightning>=2.3.0",
|
|
33
32
|
]
|
|
34
33
|
|
|
35
34
|
[project.optional-dependencies]
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from importlib.metadata import version
|
|
2
|
+
|
|
1
3
|
from .collator import Collator
|
|
2
4
|
from .data import Dataset, SimpleAnnDataset
|
|
3
5
|
from .datamodule import DataModule
|
|
4
6
|
from .preprocess import Preprocessor
|
|
5
|
-
from importlib.metadata import version
|
|
6
7
|
|
|
7
8
|
__version__ = version("scdataloader")
|
|
@@ -3,7 +3,7 @@ from typing import Optional
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
from torch import Tensor, long
|
|
5
5
|
|
|
6
|
-
from .utils import
|
|
6
|
+
from .utils import load_genes
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class Collator:
|
|
@@ -22,8 +22,6 @@ class Collator:
|
|
|
22
22
|
organism_name: str = "organism_ontology_term_id",
|
|
23
23
|
class_names: list[str] = [],
|
|
24
24
|
genelist: list[str] = [],
|
|
25
|
-
downsample: Optional[float] = None, # don't use it for training!
|
|
26
|
-
save_output: Optional[str] = None,
|
|
27
25
|
):
|
|
28
26
|
"""
|
|
29
27
|
This class is responsible for collating data for the scPRINT model. It handles the
|
|
@@ -57,13 +55,8 @@ class Collator:
|
|
|
57
55
|
class_names (list, optional): List of other classes to be considered. Defaults to [].
|
|
58
56
|
genelist (list, optional): List of genes to be considered. Defaults to [].
|
|
59
57
|
If [] all genes will be considered
|
|
60
|
-
downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
|
|
61
|
-
This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
|
|
62
|
-
save_output (str, optional): If not None, saves the output to a file. Defaults to None.
|
|
63
|
-
This is mainly for debugging purposes
|
|
64
58
|
"""
|
|
65
59
|
self.organisms = organisms
|
|
66
|
-
self.genedf = load_genes(organisms)
|
|
67
60
|
self.max_len = max_len
|
|
68
61
|
self.n_bins = n_bins
|
|
69
62
|
self.add_zero_genes = add_zero_genes
|
|
@@ -75,14 +68,14 @@ class Collator:
|
|
|
75
68
|
self.organism_name = organism_name
|
|
76
69
|
self.tp_name = tp_name
|
|
77
70
|
self.class_names = class_names
|
|
78
|
-
self.save_output = save_output
|
|
79
71
|
self.start_idx = {}
|
|
80
72
|
self.accepted_genes = {}
|
|
81
|
-
self.downsample = downsample
|
|
82
73
|
self.to_subset = {}
|
|
83
|
-
self._setup(org_to_id, valid_genes, genelist)
|
|
74
|
+
self._setup(None, org_to_id, valid_genes, genelist)
|
|
84
75
|
|
|
85
|
-
def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
|
|
76
|
+
def _setup(self, genedf=None, org_to_id=None, valid_genes=[], genelist=[]):
|
|
77
|
+
if genedf is None:
|
|
78
|
+
genedf = load_genes(self.organisms)
|
|
86
79
|
self.org_to_id = org_to_id
|
|
87
80
|
self.to_subset = {}
|
|
88
81
|
self.accepted_genes = {}
|
|
@@ -92,14 +85,17 @@ class Collator:
|
|
|
92
85
|
if org_to_id is not None
|
|
93
86
|
else set(self.organisms)
|
|
94
87
|
)
|
|
88
|
+
if len(valid_genes) > 0:
|
|
89
|
+
if len(set(valid_genes) - set(genedf.index)) > 0:
|
|
90
|
+
print("Some valid genes are not in the genedf!!!")
|
|
91
|
+
tot = genedf[genedf.index.isin(valid_genes)]
|
|
92
|
+
else:
|
|
93
|
+
tot = genedf
|
|
95
94
|
for organism in self.organisms:
|
|
96
|
-
ogenedf = self.genedf[self.genedf.organism == organism]
|
|
97
|
-
if len(valid_genes) > 0:
|
|
98
|
-
tot = self.genedf[self.genedf.index.isin(valid_genes)]
|
|
99
|
-
else:
|
|
100
|
-
tot = self.genedf
|
|
101
95
|
org = org_to_id[organism] if org_to_id is not None else organism
|
|
102
96
|
self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
|
|
97
|
+
|
|
98
|
+
ogenedf = genedf[genedf.organism == organism]
|
|
103
99
|
if len(valid_genes) > 0:
|
|
104
100
|
self.accepted_genes.update({org: ogenedf.index.isin(valid_genes)})
|
|
105
101
|
if len(genelist) > 0:
|
|
@@ -148,7 +144,6 @@ class Collator:
|
|
|
148
144
|
:, self.accepted_genes[organism_id]
|
|
149
145
|
]
|
|
150
146
|
if self.how == "most expr":
|
|
151
|
-
nnz_loc = np.where(expr > 0)[0]
|
|
152
147
|
if "knn_cells" in elem:
|
|
153
148
|
nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
|
|
154
149
|
ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
|
|
@@ -161,14 +156,18 @@ class Collator:
|
|
|
161
156
|
# loc = np.argsort(expr)[-(self.max_len) :][::-1]
|
|
162
157
|
elif self.how == "random expr":
|
|
163
158
|
nnz_loc = np.where(expr > 0)[0]
|
|
164
|
-
loc =
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
159
|
+
loc = (
|
|
160
|
+
nnz_loc[
|
|
161
|
+
np.random.choice(
|
|
162
|
+
len(nnz_loc),
|
|
163
|
+
self.max_len,
|
|
164
|
+
replace=False,
|
|
165
|
+
# p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
|
|
166
|
+
)
|
|
167
|
+
]
|
|
168
|
+
if self.max_len < len(nnz_loc)
|
|
169
|
+
else nnz_loc
|
|
170
|
+
)
|
|
172
171
|
elif self.how in ["all", "some"]:
|
|
173
172
|
loc = np.arange(len(expr))
|
|
174
173
|
else:
|
|
@@ -179,23 +178,19 @@ class Collator:
|
|
|
179
178
|
"all",
|
|
180
179
|
"some",
|
|
181
180
|
]:
|
|
181
|
+
ma = self.add_zero_genes + (
|
|
182
|
+
0 if self.max_len < len(nnz_loc) else self.max_len - len(nnz_loc)
|
|
183
|
+
)
|
|
182
184
|
if "knn_cells" in elem:
|
|
183
185
|
# we complete with genes expressed in the knn
|
|
184
|
-
nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
|
|
185
|
-
ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
|
|
186
186
|
# which is not a zero_loc in this context
|
|
187
|
-
zero_loc = np.argsort(elem["knn_cells"].sum(0))[-
|
|
187
|
+
zero_loc = np.argsort(elem["knn_cells"].sum(0))[-ma:][::-1]
|
|
188
188
|
else:
|
|
189
189
|
zero_loc = np.where(expr == 0)[0]
|
|
190
190
|
zero_loc = zero_loc[
|
|
191
191
|
np.random.choice(
|
|
192
192
|
len(zero_loc),
|
|
193
|
-
|
|
194
|
-
+ (
|
|
195
|
-
0
|
|
196
|
-
if self.max_len < len(nnz_loc)
|
|
197
|
-
else self.max_len - len(nnz_loc)
|
|
198
|
-
),
|
|
193
|
+
ma,
|
|
199
194
|
replace=False,
|
|
200
195
|
)
|
|
201
196
|
]
|
|
@@ -255,13 +250,6 @@ class Collator:
|
|
|
255
250
|
ret.update({"knn_cells": Tensor(knn_cells)})
|
|
256
251
|
if len(dataset) > 0:
|
|
257
252
|
ret.update({"dataset": Tensor(dataset).to(long)})
|
|
258
|
-
if self.downsample is not None:
|
|
259
|
-
ret["x"] = downsample_profile(ret["x"], self.downsample)
|
|
260
|
-
if self.save_output is not None:
|
|
261
|
-
with open(self.save_output, "a") as f:
|
|
262
|
-
np.savetxt(f, ret["x"].numpy())
|
|
263
|
-
with open(self.save_output + "_loc", "a") as f:
|
|
264
|
-
np.savetxt(f, gene_locs)
|
|
265
253
|
return ret
|
|
266
254
|
|
|
267
255
|
|
|
@@ -113,26 +113,34 @@ COARSE_ASSAY = {
|
|
|
113
113
|
|
|
114
114
|
|
|
115
115
|
MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
|
|
116
|
-
"HsapDv:0010000": [
|
|
116
|
+
"HsapDv:0010000": [ # postnatal stage
|
|
117
117
|
"MmusDv:0000092", # postnatal stage
|
|
118
118
|
],
|
|
119
|
-
"HsapDv:0000258": [ # mature stage
|
|
119
|
+
"HsapDv:0000258": [ # mature stage >15
|
|
120
120
|
"MmusDv:0000110", # mature stage
|
|
121
|
-
"HsapDv:0000204",
|
|
121
|
+
"HsapDv:0000204", #
|
|
122
122
|
],
|
|
123
|
-
"HsapDv:
|
|
123
|
+
"HsapDv:0000087": [], # adult stage >19
|
|
124
|
+
"HsapDv:0000227": [ # late adult stage > 40
|
|
124
125
|
"MmusDv:0000091", # 20 month-old stage
|
|
125
126
|
"MmusDv:0000089", # 18 month-old stage
|
|
127
|
+
"HsapDv:0000091", # > 45
|
|
128
|
+
"HsapDv:0000093", # > 65
|
|
129
|
+
],
|
|
130
|
+
"HsapDv:0000272": [ # 60-79 year-old stage
|
|
131
|
+
"HsapDv:0000094", # 60-79 year-old stage
|
|
126
132
|
],
|
|
127
|
-
"HsapDv:0000272": [], # 60-79 year-old stage
|
|
128
133
|
"HsapDv:0000095": [], # 80 year-old and over stage
|
|
129
|
-
"HsapDv:0000267": [ # middle aged stage
|
|
134
|
+
"HsapDv:0000267": [ # middle aged stage >40 <60
|
|
130
135
|
"MmusDv:0000087", # 16 month-old stage
|
|
131
136
|
"UBERON:0018241", # prime adult stage
|
|
132
137
|
"MmusDv:0000083", # 12 month-old stage
|
|
133
138
|
"HsapDv:0000092", # same
|
|
134
139
|
],
|
|
135
|
-
"HsapDv:0000266": [ # young adult stage
|
|
140
|
+
"HsapDv:0000266": [ # young adult stage <40
|
|
141
|
+
"HsapDv:0000088", # mature stage
|
|
142
|
+
"HsapDv:0000090", # 25 - 44
|
|
143
|
+
"HsapDv:0000086", # adolescent stage
|
|
136
144
|
"MmusDv:0000050", # 6 weeks
|
|
137
145
|
"HsapDv:0000089", # same
|
|
138
146
|
"MmusDv:0000051", # 7 weeks
|
|
@@ -163,22 +171,30 @@ MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
|
|
|
163
171
|
"MmusDv:0000099", # 26 weeks
|
|
164
172
|
"MmusDv:0000102", # 29 weeks
|
|
165
173
|
],
|
|
166
|
-
"HsapDv:0000265": [
|
|
174
|
+
"HsapDv:0000265": [ # child stage (1-4 yo)
|
|
175
|
+
"HsapDv:0000084", # 2-5 yo
|
|
176
|
+
],
|
|
167
177
|
"HsapDv:0000271": [ # juvenile stage (5-14 yo)
|
|
168
178
|
"MmusDv:0000048", # 4 weeks
|
|
169
179
|
"MmusDv:0000049", # 5 weeks
|
|
180
|
+
"HsapDv:0000081", # child
|
|
181
|
+
"HsapDv:0000085", # 6-11 yo
|
|
170
182
|
],
|
|
171
|
-
"HsapDv:0000260": [ # infant stage
|
|
183
|
+
"HsapDv:0000260": [ # infant stage <2
|
|
172
184
|
"MmusDv:0000046", # 2 weeks
|
|
173
185
|
"MmusDv:0000045", # 1 week
|
|
174
186
|
"MmusDv:0000047", # 3 weeks
|
|
175
187
|
"HsapDv:0000083",
|
|
188
|
+
"HsapDv:0000256", # under 1 yo
|
|
176
189
|
],
|
|
177
190
|
"HsapDv:0000262": [ # newborn stage (0-28 days)
|
|
178
191
|
"MmusDv:0000036", # Theiler stage 27
|
|
179
192
|
"MmusDv:0000037", # Theiler stage 28
|
|
180
193
|
"MmusDv:0000113", # 4-7 days
|
|
194
|
+
"HsapDv:0000174", # 1 month-old stage
|
|
195
|
+
"HsapDv:0000082", # newborn stage
|
|
181
196
|
],
|
|
197
|
+
"HsapDv:0000002": [], # embryonic stage
|
|
182
198
|
"HsapDv:0000007": [], # Carnegie stage 03
|
|
183
199
|
"HsapDv:0000008": [], # Carnegie stage 04
|
|
184
200
|
"HsapDv:0000009": [], # Carnegie stage 05
|