scdataloader 1.8.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scdataloader-1.8.0 → scdataloader-1.9.0}/PKG-INFO +3 -2
- {scdataloader-1.8.0 → scdataloader-1.9.0}/pyproject.toml +4 -2
- scdataloader-1.9.0/scdataloader/VERSION +1 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/__init__.py +2 -1
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/collator.py +51 -31
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/config.py +1 -1
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/data.py +4 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/datamodule.py +3 -1
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/mapped.py +26 -7
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/preprocess.py +97 -35
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/utils.py +22 -19
- scdataloader-1.8.0/scdataloader/VERSION +0 -1
- {scdataloader-1.8.0 → scdataloader-1.9.0}/.gitignore +0 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/LICENSE +0 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/README.md +0 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/__main__.py +0 -0
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/base.py +0 -0
{scdataloader-1.8.0 → scdataloader-1.9.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 1.8.0
+Version: 1.9.0
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -14,13 +14,14 @@ Requires-Dist: cellxgene-census>=0.1.0
 Requires-Dist: django>=4.0.0
 Requires-Dist: harmonypy>=0.0.10
 Requires-Dist: ipykernel>=6.20.0
+Requires-Dist: jupytext>=1.16.0
 Requires-Dist: lamindb[bionty,cellregistry,jupyter,ourprojects,zarr]<2,>=1.0.4
 Requires-Dist: leidenalg>=0.8.0
-Requires-Dist: lightning>=2.0.0
 Requires-Dist: matplotlib>=3.5.0
 Requires-Dist: numpy==1.26.0
 Requires-Dist: palantir>=1.3.3
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: pytorch-lightning>=2.3.0
 Requires-Dist: scikit-misc>=0.5.0
 Requires-Dist: seaborn>=0.11.0
 Requires-Dist: torch==2.2.0
{scdataloader-1.8.0 → scdataloader-1.9.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "1.8.0"
+version = "1.9.0"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -14,7 +14,7 @@ dependencies = [
     "lamindb[bionty,ourprojects,jupyter,cellregistry,zarr]>=1.0.4,<2",
     "cellxgene-census>=0.1.0",
     "torch==2.2.0",
-    "lightning>=2.0.0",
+    "pytorch-lightning>=2.3.0",
     "anndata>=0.9.0",
     "zarr>=2.10.0",
     "matplotlib>=3.5.0",
@@ -28,6 +28,8 @@ dependencies = [
     "scikit-misc>=0.5.0",
     "palantir>=1.3.3",
     "harmonypy>=0.0.10",
+    "jupytext>=1.16.0",
+
 ]

 [project.optional-dependencies]
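Note: the release swaps the unified `lightning` package for the classic `pytorch-lightning>=2.3.0` distribution. For downstream code this mainly changes the import path; the datamodule hunks below still reference `L.LightningDataModule`, so presumably the `L` alias is kept. A minimal sketch of the rename (the alias choice is an assumption, not shown in this diff):

    # before, with the unified package:
    #   import lightning as L
    # after, with the classic distribution:
    import pytorch_lightning as L  # keeping the L alias preserves L.LightningDataModule

    assert hasattr(L, "LightningDataModule")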
scdataloader-1.9.0/scdataloader/VERSION

@@ -0,0 +1 @@
+1.9.0
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/collator.py

@@ -24,7 +24,6 @@ class Collator:
         genelist: list[str] = [],
         downsample: Optional[float] = None,  # don't use it for training!
         save_output: Optional[str] = None,
-        metacell_mode: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -62,7 +61,6 @@ class Collator:
                 This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
             save_output (str, optional): If not None, saves the output to a file. Defaults to None.
                 This is mainly for debugging purposes
-            metacell_mode (bool, optional): Whether to sample a metacell. Defaults to False.
         """
         self.organisms = organisms
         self.genedf = load_genes(organisms)
@@ -82,7 +80,6 @@ class Collator:
         self.accepted_genes = {}
         self.downsample = downsample
         self.to_subset = {}
-        self.metacell_mode = metacell_mode
         self._setup(org_to_id, valid_genes, genelist)

     def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
@@ -135,6 +132,7 @@ class Collator:
         dataset = []
         nnz_loc = []
         is_meta = []
+        knn_cells = []
         for elem in batch:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
@@ -145,10 +143,20 @@ class Collator:
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
+                if "knn_cells" in elem:
+                    elem["knn_cells"] = elem["knn_cells"][
+                        :, self.accepted_genes[organism_id]
+                    ]
             if self.how == "most expr":
                 nnz_loc = np.where(expr > 0)[0]
-                ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
-                loc = np.argsort(expr)[-(ma):][::-1]
+                if "knn_cells" in elem:
+                    nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    loc = np.argsort(expr + elem["knn_cells"].mean(0))[-(ma):][::-1]
+                else:
+                    nnz_loc = np.where(expr > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    loc = np.argsort(expr)[-(ma):][::-1]
                 # nnz_loc = [1] * 30_000
                 # loc = np.argsort(expr)[-(self.max_len) :][::-1]
             elif self.how == "random expr":
@@ -171,33 +179,49 @@ class Collator:
                 "all",
                 "some",
             ]:
-                zero_loc = np.where(expr == 0)[0]
-                zero_loc = zero_loc[
-                    np.random.choice(
-                        len(zero_loc),
-                        self.add_zero_genes
-                        + (
-                            0
-                            if self.max_len < len(nnz_loc)
-                            else self.max_len - len(nnz_loc)
-                        ),
-                        replace=False,
-                    )
-                ]
+                if "knn_cells" in elem:
+                    # we complete with genes expressed in the knn
+                    nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    # which is not a zero_loc in this context
+                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-(ma):][::-1]
+                else:
+                    zero_loc = np.where(expr == 0)[0]
+                    zero_loc = zero_loc[
+                        np.random.choice(
+                            len(zero_loc),
+                            self.add_zero_genes
+                            + (
+                                0
+                                if self.max_len < len(nnz_loc)
+                                else self.max_len - len(nnz_loc)
+                            ),
+                            replace=False,
+                        )
+                    ]
                 loc = np.concatenate((loc, zero_loc), axis=None)
                 expr = expr[loc]
-
+                if "knn_cells" in elem:
+                    elem["knn_cells"] = elem["knn_cells"][:, loc]
             if self.how == "some":
+                if "knn_cells" in elem:
+                    elem["knn_cells"] = elem["knn_cells"][
+                        :, self.to_subset[organism_id]
+                    ]
                 expr = expr[self.to_subset[organism_id]]
                 loc = loc[self.to_subset[organism_id]]
             exprs.append(expr)
-            gene_locs.append(loc + self.start_idx[organism_id])
+            if "knn_cells" in elem:
+                knn_cells.append(elem["knn_cells"])
+            # then we need to add the start_idx to the loc to give it the correct index
+            # according to the model
+            gene_locs.append(loc + self.start_idx[organism_id])

             if self.tp_name is not None:
                 tp.append(elem[self.tp_name])
             else:
                 tp.append(0)
-            if self.metacell_mode:
+            if "is_meta" in elem:
                 is_meta.append(elem["is_meta"])
             other_classes.append([elem[i] for i in self.class_names])
         expr = np.array(exprs)
@@ -207,6 +231,7 @@ class Collator:
         other_classes = np.array(other_classes)
         dataset = np.array(dataset)
         is_meta = np.array(is_meta)
+        knn_cells = np.array(knn_cells)
         # normalize counts
         if self.norm_to is not None:
             expr = (expr * self.norm_to) / total_count[:, None]
@@ -217,15 +242,6 @@ class Collator:
         if self.n_bins:
             pass

-        # find the associated gene ids (given the species)
-
-        # get the NN cells
-
-        # do encoding / selection a la scGPT
-
-        # do encoding of graph location
-        # encode all the edges in some sparse way
-        # normalizing total counts between 0,1
         ret = {
             "x": Tensor(expr),
             "genes": Tensor(gene_locs).int(),
@@ -233,8 +249,10 @@ class Collator:
             "tp": Tensor(tp),
             "depth": Tensor(total_count),
         }
-        if self.metacell_mode:
+        if len(is_meta) > 0:
             ret.update({"is_meta": Tensor(is_meta).int()})
+        if len(knn_cells) > 0:
+            ret.update({"knn_cells": Tensor(knn_cells)})
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
         if self.downsample is not None:
@@ -242,6 +260,8 @@ class Collator:
         if self.save_output is not None:
             with open(self.save_output, "a") as f:
                 np.savetxt(f, ret["x"].numpy())
+            with open(self.save_output + "_loc", "a") as f:
+                np.savetxt(f, gene_locs)
         return ret
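The collator changes above extend the `most expr` gene selection so that the new `knn_cells` counts participate in ranking and padding. The base selection itself is unchanged: keep the `max_len` most-expressed genes, then pad with randomly drawn unexpressed genes when `add_zero_genes` is set. A self-contained numpy rehearsal of that base logic, with toy sizes; `expr`, `max_len`, and `add_zero_genes` stand in for the collator's attributes:

    import numpy as np

    rng = np.random.default_rng(0)
    expr = rng.poisson(0.3, size=200).astype(float)  # one cell's counts over 200 genes
    max_len, add_zero_genes = 20, 5

    # indices of expressed genes, most-expressed first (mirrors np.argsort(expr)[-(ma):][::-1])
    nnz_loc = np.where(expr > 0)[0]
    ma = max_len if max_len < len(nnz_loc) else len(nnz_loc)
    loc = np.argsort(expr)[-(ma):][::-1]

    # pad with unexpressed genes, as the "all"/"some" branch does
    zero_loc = np.where(expr == 0)[0]
    n_pad = add_zero_genes + (0 if max_len < len(nnz_loc) else max_len - len(nnz_loc))
    zero_loc = zero_loc[rng.choice(len(zero_loc), n_pad, replace=False)]
    loc = np.concatenate((loc, zero_loc), axis=None)

    print(expr[loc])  # fixed-length expression vector fed to the model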
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/config.py

@@ -118,7 +118,7 @@ MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
     ],
     "HsapDv:0000258": [  # mature stage
         "MmusDv:0000110",  # mature stage
-        "HsapDv:0000204",
+        "HsapDv:0000204",  #
     ],
     "HsapDv:0000227": [  # late adult stage
         "MmusDv:0000091",  # 20 month-old stage
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/data.py

@@ -58,6 +58,7 @@ class Dataset(torchDataset):
     hierarchical_clss: Optional[list[str]] = field(default_factory=list)
     join_vars: Literal["inner", "outer"] | None = None
     metacell_mode: float = 0.0
+    get_knn_cells: bool = False

     def __post_init__(self):
         self.mapped_dataset = mapped(
@@ -69,6 +70,7 @@ class Dataset(torchDataset):
             stream=True,
             parallel=True,
             metacell_mode=self.metacell_mode,
+            get_knn_cells=self.get_knn_cells,
         )
         print(
             "won't do any check but we recommend to have your dataset coming from local storage"
@@ -371,6 +373,7 @@ def mapped(
     is_run_input: bool | None = None,
     metacell_mode: bool = False,
     meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
+    get_knn_cells: bool = False,
 ) -> MappedCollection:
     path_list = []
     for artifact in dataset.artifacts.all():
@@ -397,5 +400,6 @@ def mapped(
         dtype=dtype,
         meta_assays=meta_assays,
         metacell_mode=metacell_mode,
+        get_knn_cells=get_knn_cells,
     )
     return ds
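The `data.py` changes are pure plumbing: the new `Dataset.get_knn_cells` field is forwarded through the module-level `mapped()` helper into `MappedCollection`. A toy rehearsal of that forwarding pattern, with simplified names that are not the real signatures:

    from dataclasses import dataclass, field

    def mapped(path_list, metacell_mode=0.0, get_knn_cells=False):
        # stand-in for scdataloader.data.mapped(), which builds a MappedCollection
        return {"paths": path_list, "metacell_mode": metacell_mode,
                "get_knn_cells": get_knn_cells}

    @dataclass
    class Dataset:
        path_list: list = field(default_factory=list)
        metacell_mode: float = 0.0
        get_knn_cells: bool = False

        def __post_init__(self):
            # mirrors the diff: the new flag is forwarded verbatim to mapped()
            self.mapped_dataset = mapped(
                self.path_list,
                metacell_mode=self.metacell_mode,
                get_knn_cells=self.get_knn_cells,
            )

    print(Dataset(get_knn_cells=True).mapped_dataset["get_knn_cells"])  # True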
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/datamodule.py

@@ -52,6 +52,7 @@ class DataModule(L.LightningDataModule):
             # "EFO:0030062", # slide-seq
         ],
         metacell_mode: float = 0.0,
+        get_knn_cells: bool = False,
         modify_seed_on_requeue: bool = True,
         **kwargs,
     ):
@@ -88,6 +89,7 @@ class DataModule(L.LightningDataModule):
             metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
             clss_to_predict (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
             modify_seed_on_requeue (bool, optional): Whether to modify the seed on requeue. Defaults to True.
+            get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each queried cell. Defaults to False.
             **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
             see @file data.py and @file collator.py for more details about some of the parameters
         """
@@ -98,6 +100,7 @@ class DataModule(L.LightningDataModule):
             clss_to_predict=clss_to_predict,
             hierarchical_clss=hierarchical_clss,
             metacell_mode=metacell_mode,
+            get_knn_cells=get_knn_cells,
         )
         # and location
         self.metacell_mode = bool(metacell_mode)
@@ -157,7 +160,6 @@ class DataModule(L.LightningDataModule):
             tp_name=tp_name,
             organism_name=organism_name,
             class_names=clss_to_predict,
-            metacell_mode=bool(metacell_mode),
         )
         self.validation_split = validation_split
         self.test_split = test_split
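With `get_knn_cells=True`, batches gain a `knn_cells` tensor alongside the existing keys; given how the collator stacks each element's neighbor matrix, its shape is (batch, k, genes). A hypothetical consumer, with key names taken from the diff and sizes made up:

    import torch

    # a batch shaped like the 1.9.0 collator output (2 cells, 6 neighbors, 4 genes)
    batch = {
        "x": torch.rand(2, 4),
        "genes": torch.randint(0, 100, (2, 4)),
        "depth": torch.rand(2),
        "is_meta": torch.tensor([0, 1], dtype=torch.int32),
        "knn_cells": torch.rand(2, 6, 4),
    }

    if "knn_cells" in batch:
        # e.g. average a cell with its neighbors, akin to a metacell view
        smoothed = (batch["x"] + batch["knn_cells"].mean(dim=1)) / 2
        print(smoothed.shape)  # torch.Size([2, 4])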
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/mapped.py

@@ -96,8 +96,9 @@ class MappedCollection:
         cache_categories: Enable caching categories of ``obs_keys`` for faster access.
         parallel: Enable sampling with multiple processes.
         dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
-        meta_assays: Assays …
-        metacell_mode: …
+        meta_assays: Assays that are already defined as metacells.
+        metacell_mode: Frequency at which to sample a metacell (an average of k-nearest neighbors).
+        get_knn_cells: Whether to also load the k-nearest neighbors of each queried cell.
     """

     def __init__(
@@ -114,6 +115,7 @@ class MappedCollection:
         parallel: bool = False,
         dtype: str | None = None,
         metacell_mode: float = 0.0,
+        get_knn_cells: bool = False,
         meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
     ):
         if join not in {None, "inner", "outer"}:  # pragma: nocover
@@ -166,6 +168,7 @@ class MappedCollection:
         self.metacell_mode = metacell_mode
         self.path_list = path_list
         self.meta_assays = meta_assays
+        self.get_knn_cells = get_knn_cells
         self._make_connections(path_list, parallel)

         self._cache_cats: dict = {}
@@ -396,12 +399,15 @@ class MappedCollection:
             label_idx = self.encoders[label][label_idx]
             out[label] = label_idx

-        out["is_meta"] = False
-        if len(self.meta_assays) > 0 and "assay_ontology_term_id" in self.obs_keys:
-            if out["assay_ontology_term_id"] in self.meta_assays:
-                out["is_meta"] = True
-        return out
         if self.metacell_mode > 0:
+            if (
+                len(self.meta_assays) > 0
+                and "assay_ontology_term_id" in self.obs_keys
+            ):
+                if out["assay_ontology_term_id"] in self.meta_assays:
+                    out["is_meta"] = True
+                    return out
+            out["is_meta"] = False
             if np.random.random() < self.metacell_mode:
                 out["is_meta"] = True
                 distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
@@ -410,6 +416,19 @@ class MappedCollection:
                 out[layers_key] += self._get_data_idx(
                     lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
                 )
+        elif self.get_knn_cells:
+            distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
+            nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+            out["knn_cells"] = np.array(
+                [
+                    self._get_data_idx(
+                        lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
+                    )
+                    for i in nn_idx
+                ],
+                dtype=int,
+            )
+            out["distances"] = distances[nn_idx]

         return out
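The neighbor lookup reads one row of `obsp["distances"]` (a kNN-graph row: zeros everywhere except the stored neighbors) and ranks it by `-1 / (d - 1e-6)`. That transform maps zero entries to a large positive value and real neighbors to negative values whose magnitude grows as the distance shrinks, so an ascending argsort yields neighbors nearest-first with non-neighbors pushed to the end; `[:6]` then keeps the six nearest. A small numpy illustration of just that transform:

    import numpy as np

    # one row of obsp["distances"]: zeros except the cell's stored neighbors
    row = np.array([0.0, 0.8, 0.0, 0.1, 0.0, 0.3, 0.0, 0.5])

    order = np.argsort(-1 / (row - 1e-6))
    print(order)           # [3 5 7 1 0 2 4 6] -> real neighbors nearest-first, zeros last
    print(row[order[:3]])  # [0.1 0.3 0.5]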
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/preprocess.py

@@ -9,7 +9,7 @@ import scanpy as sc
 from anndata import AnnData, read_h5ad
 from scipy.sparse import csr_matrix
 from upath import UPath
-
+import gc
 from scdataloader import utils as data_utils

 FULL_LENGTH_ASSAYS = [
@@ -18,7 +18,7 @@ FULL_LENGTH_ASSAYS = [
     "EFO:0008931",
 ]

-MAXFILESIZE = …
+MAXFILESIZE = 5_000_000_000


 class Preprocessor:
@@ -64,6 +64,11 @@ class Preprocessor:
         """
         Initializes the preprocessor and configures the workflow steps.

+        Your dataset should contain at least the following obs:
+        - `organism_ontology_term_id` with the ontology id of the organism of your anndata
+        - gene names in the `var.index` field of your anndata that map to the ensembl_gene nomenclature
+          or the HUGO gene symbol nomenclature (if the latter, set `is_symbol` to True)
+
         Args:
             filter_gene_by_counts (int or bool, optional): Determines whether to filter genes by counts.
                 If int, filters genes with counts. Defaults to False.
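A minimal AnnData meeting those two requirements could be built like this (toy counts; the three Ensembl IDs are real human gene IDs chosen for illustration):

    import anndata as ad
    import numpy as np
    import pandas as pd

    adata = ad.AnnData(
        X=np.random.poisson(1.0, size=(10, 3)).astype(np.float32),
        obs=pd.DataFrame(
            {"organism_ontology_term_id": ["NCBITaxon:9606"] * 10},
            index=[f"cell_{i}" for i in range(10)],
        ),
        var=pd.DataFrame(index=["ENSG00000121410", "ENSG00000148584", "ENSG00000175899"]),
    )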
@@ -130,13 +135,21 @@ class Preprocessor:
         self.keepdata = keepdata

     def __call__(self, adata, dataset_id=None) -> AnnData:
-        if …
+        if self.additional_preprocess is not None:
+            adata = self.additional_preprocess(adata)
+        if "organism_ontology_term_id" not in adata[0].obs.columns:
+            raise ValueError(
+                "organism_ontology_term_id not found in adata.obs, you need to add an ontology term id for the organism of your anndata"
+            )
+        if not adata[0].var.index.str.contains("ENS").any() and not self.is_symbol:
+            raise ValueError(
+                "gene names in the `var.index` field of your anndata should map to the ensembl_gene nomenclature else set `is_symbol` to True if using hugo symbols"
+            )
+        if adata.obs["organism_ontology_term_id"].iloc[0] not in self.organisms:
             raise ValueError(
                 "we cannot work with this organism",
-                adata…
+                adata.obs["organism_ontology_term_id"],
             )
-        if self.additional_preprocess is not None:
-            adata = self.additional_preprocess(adata)
         if adata.raw is not None and self.use_raw:
             adata.X = adata.raw.X
             del adata.raw
@@ -152,11 +165,12 @@ class Preprocessor:
         del adata.layers
         if len(adata.varm.keys()) > 0 and not self.keepdata:
             del adata.varm
-        if len(adata.obsm.keys()) > 0 and …
+        if len(adata.obsm.keys()) > 0 and not self.keepdata:
             del adata.obsm
-        if len(adata.obsp.keys()) > 0 and …
+        if len(adata.obsp.keys()) > 0 and not self.keepdata:
             del adata.obsp
         # check that it is a count
+
         print("checking raw counts")
         if np.abs(
             adata[:50_000].X.astype(int) - adata[:50_000].X
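The "checking raw counts" step relies on integer-valued data leaving no fractional residue when cast to int, whereas normalized or log-transformed data does. A toy version of the same test:

    import numpy as np

    counts = np.array([[0., 3., 1.], [2., 0., 5.]])      # raw counts stored as floats
    normed = np.log1p(counts / counts.sum(1, keepdims=True) * 1e4)

    def looks_like_counts(X):
        # same idea as the diff's check: integer-valued data has no fractional residue
        return np.abs(X.astype(int) - X).sum() == 0

    print(looks_like_counts(counts))  # True
    print(looks_like_counts(normed))  # False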
@@ -217,23 +231,51 @@ class Preprocessor:
                 )
             )

-        if …
-        …
+        # Check if we have a mix of gene names and ensembl IDs
+        has_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").any()
+        all_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").all()
+
+        if not has_ens:
+            print("No ENS genes found, assuming gene symbols...")
+        elif not all_ens:
+            print("Mix of ENS and gene symbols found, converting all to ENS IDs...")
+
+        genesdf["ensembl_gene_id"] = genesdf.index
+
+        # For genes that are already ENS IDs, use them directly
+        ens_mask = adata.var.index.str.match(r"ENS.*\d{6,}$")
+        symbol_mask = ~ens_mask
+
+        # Handle symbol genes
+        if symbol_mask.any():
+            symbol_var = adata.var[symbol_mask].merge(
+                genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
+                left_index=True,
+                right_index=True,
+                how="inner",
+            )
+
+        # Handle ENS genes
+        if ens_mask.any():
+            ens_var = adata.var[ens_mask].merge(
+                genesdf, left_index=True, right_index=True, how="inner"
             )
-        …
+
+        # Combine and sort
+        if symbol_mask.any() and ens_mask.any():
+            var = pd.concat([symbol_var, ens_var])
+        elif symbol_mask.any():
+            var = symbol_var
+        else:
+            var = ens_var
+
+        adata = adata[:, var.index]
+        var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
+        # Update adata with combined genes
+        adata.var = var
+        genesdf = genesdf.set_index("ensembl_gene_id")
+        # Drop duplicate genes, keeping first occurrence
+        adata = adata[:, ~adata.var.index.duplicated(keep="first")]

         intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
         print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
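The new conversion treats `var.index` as a mix: anything matching `ENS` plus at least six trailing digits is used directly, everything else is joined against the gene table by symbol, and the two halves are concatenated and re-indexed by `ensembl_gene_id`. A compact, self-contained rehearsal of that logic on toy data (real Ensembl IDs and symbols, for illustration only):

    import pandas as pd

    var = pd.DataFrame(index=["ENSG00000121410", "A2M", "GCLC"])
    genesdf = pd.DataFrame(
        {"symbol": ["A1BG", "A2M", "GCLC"]},
        index=["ENSG00000121410", "ENSG00000175899", "ENSG00000001084"],
    )
    genesdf["ensembl_gene_id"] = genesdf.index

    ens_mask = var.index.str.match(r"ENS.*\d{6,}$")
    symbol_var = var[~ens_mask].merge(
        genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
        left_index=True, right_index=True, how="inner",
    )
    ens_var = var[ens_mask].merge(genesdf, left_index=True, right_index=True, how="inner")
    out = pd.concat([symbol_var, ens_var]).sort_values("ensembl_gene_id")
    print(out["ensembl_gene_id"].tolist())
    # ['ENSG00000001084', 'ENSG00000121410', 'ENSG00000175899']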
@@ -462,13 +504,17 @@ class LaminPreprocessor(Preprocessor):
             print(file)

             path = cache_path(file) if self.force_preloaded else file.cache()
-            backed = read_h5ad(path, backed="r")
-            …
-            …
-            …
+            backed = file.open()
+            # backed = read_h5ad(path, backed="r")
+            if "is_primary_data" in backed.obs.columns:
+                if backed.obs.is_primary_data.sum() == 0:
+                    print(f"{file.key} only contains non primary cells.. dropping")
+                    # Save the stem_uid to a file to avoid loading it again
                     with open("nonprimary.txt", "a") as f:
                         f.write(f"{file.stem_uid}\n")
                     continue
+            else:
+                print("Warning: couldn't check unicity from is_primary_data column")
             if backed.shape[1] < 1000:
                 print(
                     f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
@@ -489,16 +535,23 @@ class LaminPreprocessor(Preprocessor):
                 block_size = int(
                     (np.ceil(badata.shape[0] / 30_000) * 30_000) // num_blocks
                 )
-                print(…
+                print(
+                    "num blocks ",
+                    num_blocks,
+                    "block size ",
+                    block_size,
+                    "total elements ",
+                    badata.shape[0],
+                )
                 for j in range(num_blocks):
-                    if j == 0 and i == 390:
-                        continue
                     start_index = j * block_size
                     end_index = min((j + 1) * block_size, badata.shape[0])
-                    block = badata[start_index:end_index]
+                    block = badata[start_index:end_index]
+                    block = block.to_memory()
                     print(block)
                     block = super().__call__(
-                        block,
+                        block,
+                        dataset_id=file.stem_uid + "_p" + str(j),
                     )
                     myfile = ln.Artifact.from_anndata(
                         block,
@@ -508,16 +561,19 @@ class LaminPreprocessor(Preprocessor):
                         + " p"
                         + str(j)
                         + " ( revises file "
-                        + str(file.…
+                        + str(file.stem_uid)
                         + " )",
                         version=version,
                     )
                     myfile.save()
+
                     if self.keep_files:
                         files.append(myfile)
+                        del block
                     else:
                         del myfile
                         del block
+                    gc.collect()


             else:
@@ -530,6 +586,7 @@ class LaminPreprocessor(Preprocessor):
             myfile.save()
             if self.keep_files:
                 files.append(myfile)
+                del adata
             else:
                 del myfile
                 del adata
@@ -549,7 +606,12 @@ class LaminPreprocessor(Preprocessor):

         # issues with KLlggfw6I6lvmbqiZm46
         if self.keep_files:
-            …
+            # Reconstruct collection using keys
+            dataset = ln.Collection(
+                [ln.Artifact.filter(key=k).one() for k in files],
+                name=name,
+                description=description,
+            )
             dataset.save()
             return dataset
         else:
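For intuition on the chunking above: large files (past the MAXFILESIZE threshold, now 5_000_000_000 bytes) are split into blocks whose size is rounded to a multiple derived from 30_000 cells and then processed one block at a time. A worked example of just the arithmetic; `num_blocks` in the real code comes from the file-size check and is hard-coded here:

    import numpy as np

    n_obs, num_blocks = 112_345, 4  # toy values
    block_size = int((np.ceil(n_obs / 30_000) * 30_000) // num_blocks)  # 30_000

    for j in range(num_blocks):
        start_index = j * block_size
        end_index = min((j + 1) * block_size, n_obs)
        print(j, start_index, end_index)
    # 0 0 30000 / 1 30000 60000 / 2 60000 90000 / 3 90000 112345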
{scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/utils.py

@@ -154,7 +154,7 @@ def getBiomartTable(
     return res


-def validate(adata: AnnData, organism: str, need_all=…):
+def validate(adata: AnnData, organism: str, need_all=False):
     """
     validate checks if the adata object is valid for lamindb

@@ -578,7 +578,6 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10…


 def populate_my_ontology(
-    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
     sex: List[str] = ["PATO:0000384", "PATO:0000383"],
     celltypes: List[str] = [],
     ethnicities: List[str] = [],
@@ -586,7 +585,7 @@ def populate_my_ontology(
     tissues: List[str] = [],
     diseases: List[str] = [],
     dev_stages: List[str] = [],
-    …
+    organisms_clade: List[str] = ["vertebrates", "plants"],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -622,23 +621,27 @@ def populate_my_ontology(
     ln.save(records)
     bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    if …
-    …
-        if isinstance(organism_or_organismlist, bt.Organism)
-        else organism_or_organismlist[0]
-        for organism_or_organismlist in [
-            bt.Organism.from_source(ontology_id=name, source=source)
-            for name in names
+    if organisms_clade is not None:
+        records = []
+        for organism_clade in organisms_clade:
+            names = bt.Organism.public(organism=organism_clade).df().index
+            source = bt.PublicSource.filter(
+                name="ensembl", organism=organism_clade
+            ).last()
+            records += [
+                bt.Organism.from_source(name=name, source=source) for name in names
             ]
-        ]
-
+        nrecords = []
+        prevrec = set()
+        for rec in records:
+            if rec is None:
+                continue
+            if not isinstance(rec, bt.Organism):
+                rec = rec[0]
+            if rec.uid not in prevrec:
+                nrecords.append(rec)
+                prevrec.add(rec.uid)
+        ln.save(nrecords)
     bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
     if sex is not None:
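The new organism block dedupes records by `uid` and tolerates `from_source` returning None or a list. The same pattern, rehearsed on plain objects (`Rec` is a stand-in for `bt.Organism` records; the list case mirrors the `rec = rec[0]` branch):

    from dataclasses import dataclass

    @dataclass
    class Rec:
        uid: str

    records = [Rec("a"), None, [Rec("b")], Rec("a")]
    nrecords, prevrec = [], set()
    for rec in records:
        if rec is None:
            continue
        if not isinstance(rec, Rec):
            rec = rec[0]  # a source may hand back a list; keep its first element
        if rec.uid not in prevrec:
            nrecords.append(rec)
            prevrec.add(rec.uid)
    print([r.uid for r in nrecords])  # ['a', 'b']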
scdataloader-1.8.0/scdataloader/VERSION

@@ -1 +0,0 @@
-1.8.0

Files without changes:
- {scdataloader-1.8.0 → scdataloader-1.9.0}/.gitignore
- {scdataloader-1.8.0 → scdataloader-1.9.0}/LICENSE
- {scdataloader-1.8.0 → scdataloader-1.9.0}/README.md
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/__main__.py
- {scdataloader-1.8.0 → scdataloader-1.9.0}/scdataloader/base.py