scdataloader 1.0.1__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +2 -2
- scdataloader/collator.py +6 -66
- scdataloader/data.py +42 -5
- scdataloader/datamodule.py +1 -1
- scdataloader/mapped.py +113 -92
- scdataloader/preprocess.py +1 -6
- scdataloader/utils.py +75 -85
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/METADATA +68 -9
- scdataloader-1.0.6.dist-info/RECORD +16 -0
- scdataloader-1.0.1.dist-info/RECORD +0 -16
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/LICENSE +0 -0
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/WHEEL +0 -0
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/entry_points.txt +0 -0
scdataloader/VERSION
CHANGED
@@ -1 +1 @@
-1.0.1
+1.0.5
scdataloader/__init__.py
CHANGED
scdataloader/collator.py
CHANGED
@@ -92,7 +92,10 @@ class Collator:
             )
         for organism in self.organisms:
             ogenedf = self.genedf[self.genedf.organism == organism]
-            tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            if len(valid_genes) > 0:
+                tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            else:
+                tot = self.genedf
             org = org_to_id[organism] if org_to_id is not None else organism
             self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
             if len(valid_genes) > 0:
@@ -108,7 +111,7 @@ class Collator:
         Args:
             batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
                 the first list is for the different samples, the second list is for the different elements with
-                elem["x"]: gene expression
+                elem["X"]: gene expression
                 elem["organism_name"]: organism ontology term id
                 elem["tp_name"]: heat diff
                 elem["class_names.."]: other classes
@@ -132,7 +135,7 @@ class Collator:
                 continue
             if "_storage_idx" in elem:
                 dataset.append(elem["_storage_idx"])
-            expr = np.array(elem["x"])
+            expr = np.array(elem["X"])
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
@@ -231,69 +234,6 @@ class Collator:
         return ret


-class AnnDataCollator(Collator):
-    def __init__(self, *args, **kwargs):
-        """
-        AnnDataCollator Collator to use if working with AnnData's experimental dataloader (it is very slow!!!)
-
-        Args:
-            @see Collator
-        """
-        super().__init__(*args, **kwargs)
-
-    def __call__(self, batch) -> dict[str, Tensor]:
-        exprs = []
-        total_count = []
-        other_classes = []
-        gene_locs = []
-        tp = []
-        for elem in batch:
-            organism_id = elem.obs[self.organism_name]
-            if organism_id.item() not in self.organism_ids:
-                print(organism_id)
-            expr = np.array(elem.X[0])
-
-            total_count.append(expr.sum())
-            if len(self.accepted_genes) > 0:
-                expr = expr[self.accepted_genes[organism_id]]
-            if self.how == "most expr":
-                loc = np.argsort(expr)[-(self.max_len) :][::-1]
-            elif self.how == "random expr":
-                nnz_loc = np.where(expr > 0)[0]
-                loc = nnz_loc[
-                    np.random.choice(len(nnz_loc), self.max_len, replace=False)
-                ]
-            else:
-                raise ValueError("how must be either most expr or random expr")
-            if self.add_zero_genes > 0:
-                zero_loc = np.where(expr == 0)[0]
-                zero_loc = [
-                    np.random.choice(len(zero_loc), self.add_zero_genes, replace=False)
-                ]
-                loc = np.concatenate((loc, zero_loc), axis=None)
-            exprs.append(expr[loc])
-            gene_locs.append(loc + self.start_idx[organism_id.item()])
-
-            if self.tp_name is not None:
-                tp.append(elem.obs[self.tp_name])
-            else:
-                tp.append(0)
-
-            other_classes.append([elem.obs[i].values[0] for i in self.class_names])
-
-        expr = np.array(exprs)
-        tp = np.array(tp)
-        gene_locs = np.array(gene_locs)
-        total_count = np.array(total_count)
-        other_classes = np.array(other_classes)
-        return {
-            "x": Tensor(expr),
-            "genes": Tensor(gene_locs).int(),
-            "depth": Tensor(total_count),
-            "class": Tensor(other_classes),
-        }
-
-
 #############
 #### WIP ####
 #############
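For orientation, here is a minimal sketch of wiring this `Collator` into a PyTorch `DataLoader`. The constructor argument names are inferred from the attributes the diff touches (`self.organisms`, `self.how`, `self.max_len`, `self.add_zero_genes`) and may not match the released signature exactly; the collated batch keys ("x", "genes", "depth", "class") mirror those of the removed `AnnDataCollator`.

```python
# Hypothetical usage sketch; argument names inferred from attributes seen in
# the diff, not verified against the released signature.
from torch.utils.data import DataLoader
from scdataloader.collator import Collator

collator = Collator(
    organisms=["NCBITaxon:9606"],  # species expected in the batches
    how="most expr",               # keep the max_len most-expressed genes per cell
    max_len=2000,
    add_zero_genes=100,            # additionally sample some zero-expression genes
)
# `dataset` is any map-style dataset whose items carry an "X" expression array
# (the 1.0.6 key; it was lowercase "x" in 1.0.1) plus obs columns.
loader = DataLoader(dataset, batch_size=64, collate_fn=collator)
```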
scdataloader/data.py
CHANGED
@@ -8,7 +8,7 @@ import bionty as bt
 import pandas as pd
 from torch.utils.data import Dataset as torchDataset
 from typing import Union, Optional, Literal
-from scdataloader import mapped
+from scdataloader.mapped import MappedCollection
 import warnings

 from anndata import AnnData
@@ -74,9 +74,9 @@ class Dataset(torchDataset):
     join_vars: Literal["inner", "outer"] | None = None

     def __post_init__(self):
-        self.mapped_dataset = mapped.mapped(
+        self.mapped_dataset = mapped(
             self.lamin_dataset,
-            label_keys=self.obs,
+            obs_keys=self.obs,
             join=self.join_vars,
             encode_labels=self.clss_to_pred,
             unknown_label="unknown",
@@ -311,7 +311,7 @@ class SimpleAnnDataset(torchDataset):
         for idx, obs in enumerate(self.adata.obs.itertuples(index=False)):
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=DeprecationWarning)
-                out = {"x": self.adataX[idx].reshape(-1)}
+                out = {"X": self.adataX[idx].reshape(-1)}
                 out.update(
                     {name: val for name, val in self.obs_to_output.iloc[idx].items()}
                 )
@@ -320,8 +320,45 @@
     def __getitem__(self, idx):
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=DeprecationWarning)
-            out = {"x": self.adataX[idx].reshape(-1)}
+            out = {"X": self.adataX[idx].reshape(-1)}
             out.update(
                 {name: val for name, val in self.obs_to_output.iloc[idx].items()}
             )
             return out
+
+
+def mapped(
+    dataset,
+    obs_keys: list[str] | None = None,
+    join: Literal["inner", "outer"] | None = "inner",
+    encode_labels: bool | list[str] = True,
+    unknown_label: str | dict[str, str] | None = None,
+    cache_categories: bool = True,
+    parallel: bool = False,
+    dtype: str | None = None,
+    stream: bool = False,
+    is_run_input: bool | None = None,
+) -> MappedCollection:
+    path_list = []
+    for artifact in dataset.artifacts.all():
+        if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
+            print(f"Ignoring artifact with suffix {artifact.suffix}")
+            continue
+        elif not artifact.path.exists():
+            print(f"Path does not exist for artifact with suffix {artifact.suffix}")
+            continue
+        elif not stream:
+            path_list.append(artifact.stage())
+        else:
+            path_list.append(artifact.path)
+    ds = MappedCollection(
+        path_list=path_list,
+        obs_keys=obs_keys,
+        join=join,
+        encode_labels=encode_labels,
+        unknown_label=unknown_label,
+        cache_categories=cache_categories,
+        parallel=parallel,
+        dtype=dtype,
+    )
+    return ds
scdataloader/datamodule.py
CHANGED
@@ -130,7 +130,7 @@ class DataModule(L.LightningDataModule):
             print(f"reduced the size to {len(set(c))/len(biomart)}")
             biomart["pos"] = c
             mdataset.genedf = biomart.loc[mdataset.genedf.index]
-            self.gene_pos = mdataset.genedf["pos"].tolist()
+            self.gene_pos = mdataset.genedf["pos"].astype(int).tolist()

             if gene_embeddings != "":
                 mdataset.genedf = mdataset.genedf.join(
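The added `.astype(int)` guards against a general pandas pitfall: integer columns are silently upcast to float when an operation introduces missing values, and float positions would later fail wherever `gene_pos` is used as an index. Whether that exact upcast happens here is an assumption; the diff only shows the cast being added. A demonstration of the underlying pandas behavior:

```python
import pandas as pd

s = pd.Series([10, 20, 30])       # dtype: int64
upcast = s.reindex([0, 1, 2, 3])  # the unknown label becomes NaN
print(upcast.dtype)               # float64: the ints were silently upcast
print(upcast.dropna().astype(int).tolist())  # [10, 20, 30], ints again
```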
scdataloader/mapped.py
CHANGED
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union

 import numpy as np
 import pandas as pd
-from lamin_utils import logger
 from lamindb_setup.core.upath import UPath
-from lamindb.core._data import _track_run_input

-from lamindb.core.storage._backed_access import (
+from lamindb.core.storage._anndata_accessor import (
+    ArrayType,
     ArrayTypes,
+    GroupType,
     GroupTypes,
     StorageType,
     _safer_read_index,
@@ -47,42 +47,6 @@ class _Connect:
         self.conn.close()


-def mapped(
-    dataset,
-    label_keys: str | list[str] | None = None,
-    join: Literal["inner", "outer"] | None = "inner",
-    encode_labels: bool | list[str] = True,
-    unknown_label: str | dict[str, str] | None = None,
-    cache_categories: bool = True,
-    parallel: bool = False,
-    dtype: str | None = None,
-    stream: bool = False,
-    is_run_input: bool | None = None,
-) -> MappedCollection:
-    path_list = []
-    for artifact in dataset.artifacts.all():
-        if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
-            logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
-            continue
-        elif not stream:
-            path_list.append(artifact.stage())
-        else:
-            path_list.append(artifact.path)
-    ds = MappedCollection(
-        path_list,
-        label_keys,
-        join,
-        encode_labels,
-        unknown_label,
-        cache_categories,
-        parallel,
-        dtype,
-    )
-    # track only if successful
-    _track_run_input(dataset, is_run_input)
-    return ds
-
-
 class MappedCollection:
     """Map-style collection for use in data loaders.

@@ -92,6 +56,12 @@ class MappedCollection:
     If your `AnnData` collection is in the cloud, move them into a local cache
     first for faster access.

+    `__getitem__` of the `MappedCollection` object takes a single integer index
+    and returns a dictionary with the observation data sample for this index from
+    the `AnnData` objects in `path_list`. The dictionary has keys for `layers_keys`
+    (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
+    for the index of the `AnnData` object containing this observation sample.
+
     .. note::

        For a guide, see :doc:`docs:scrna5`.
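Following the docstring above, fetching one sample might look like this minimal sketch (the `.h5ad` paths and the key names "cell_type" and "X_pca" are hypothetical placeholders):

```python
# Sketch of indexing a MappedCollection per the docstring above.
from scdataloader.mapped import MappedCollection

mc = MappedCollection(
    path_list=["a.h5ad", "b.h5ad"],  # hypothetical local files
    obs_keys=["cell_type"],
    obsm_keys=["X_pca"],
)
item = mc[0]
item["X"]           # expression row from .X (layers_keys defaults to ["X"])
item["obsm_X_pca"]  # .obsm entries come back under f"obsm_{key}"
item["cell_type"]   # requested obs column, integer-encoded by default
item["_store_idx"]  # which AnnData file this observation came from
```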
@@ -107,53 +77,71 @@ class MappedCollection:

     Args:
         path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats.
-        label_keys: …
+        layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
+            retrieves ``.X``.
+        obsm_keys: Keys from the ``.obsm`` slots.
+        obs_keys: Keys from the ``.obs`` slots.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
-            Can be a list with elements from ``label_keys``.
+            Can be a list with elements from ``obs_keys``.
         unknown_label: Encode this label to -1.
-            Can be a dictionary with keys from ``label_keys`` if ``encode_labels=True``
+            Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
             or from ``encode_labels`` if it is a list.
-        cache_categories: Enable caching categories of ``label_keys`` for faster access.
+        cache_categories: Enable caching categories of ``obs_keys`` for faster access.
         parallel: Enable sampling with multiple processes.
-        dtype: Convert numpy arrays from ``.X``
+        dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
     """

     def __init__(
         self,
         path_list: list[UPathStr],
-        label_keys: str | list[str] | None = None,
-        join: Literal["inner", "outer"] | None = "inner",
+        layers_keys: str | list[str] | None = None,
+        obs_keys: str | list[str] | None = None,
+        obsm_keys: str | list[str] | None = None,
+        join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
         cache_categories: bool = True,
         parallel: bool = False,
         dtype: str | None = None,
     ):
-        …
+        if join not in {None, "inner", "outer"}:  # pragma: nocover
+            raise ValueError(
+                f"join must be one of None, 'inner, or 'outer' but was {type(join)}"
+            )

-        label_keys = [label_keys] if isinstance(label_keys, str) else label_keys
-        self.label_keys = label_keys
+        if layers_keys is None:
+            self.layers_keys = ["X"]
+        else:
+            self.layers_keys = (
+                [layers_keys] if isinstance(layers_keys, str) else layers_keys
+            )
+
+        obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys
+        self.obsm_keys = obsm_keys
+
+        obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys
+        self.obs_keys = obs_keys

         if isinstance(encode_labels, list):
             if len(encode_labels) == 0:
                 encode_labels = False
-            elif not all(
-                enc_label in label_keys for enc_label in encode_labels
+            elif obs_keys is None or not all(
+                enc_label in obs_keys for enc_label in encode_labels
             ):
                 raise ValueError(
-                    "All elements of `encode_labels` should be in `label_keys`."
+                    "All elements of `encode_labels` should be in `obs_keys`."
                 )
         else:
             if encode_labels:
-                encode_labels = label_keys
+                encode_labels = obs_keys if obs_keys is not None else False
         self.encode_labels = encode_labels

         if encode_labels and isinstance(unknown_label, dict):
             if not all(unkey in encode_labels for unkey in unknown_label):  # type: ignore
                 raise ValueError(
-                    "All keys of `unknown_label` should be in `encode_labels` and `label_keys`."
+                    "All keys of `unknown_label` should be in `encode_labels` and `obs_keys`."
                 )
         self.unknown_label = unknown_label

@@ -194,12 +182,16 @@ class MappedCollection:

         self.join_vars = join
         self.var_indices = None
+        self.var_joint = None
+        self.n_vars_list = None
+        self.n_vars = None
         if self.join_vars is not None:
             self._make_join_vars()
+            self.n_vars = len(self.var_joint)

-        if self.label_keys is not None:
+        if self.obs_keys is not None:
             if cache_categories:
-                self._cache_categories(self.label_keys)
+                self._cache_categories(self.obs_keys)
             else:
                 self._cache_cats: dict = {}
             self.encoders: dict = {}
@@ -222,10 +214,10 @@ class MappedCollection:
             self.conns.append(conn)
             self.storages.append(storage)

-    def _cache_categories(self, label_keys: list):
+    def _cache_categories(self, obs_keys: list):
         self._cache_cats = {}
         decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
-        for label in label_keys:
+        for label in obs_keys:
             self._cache_cats[label] = []
             for storage in self.storages:
                 with _Connect(storage) as store:
@@ -252,9 +244,12 @@ class MappedCollection:

     def _make_join_vars(self):
         var_list = []
+        self.n_vars_list = []
         for storage in self.storages:
             with _Connect(storage) as store:
-                var_list.append(_safer_read_index(store["var"]))
+                vars = _safer_read_index(store["var"])
+                var_list.append(vars)
+                self.n_vars_list.append(len(vars))

         self.var_joint = None
         vars_eq = all(var_list[0].equals(vrs) for vrs in var_list[1:])
@@ -262,6 +257,7 @@ class MappedCollection:
             self.join_vars = None
             self.var_joint = var_list[0]
             return
+
         if self.join_vars == "inner":
             self.var_joint = reduce(pd.Index.intersection, var_list)
             if len(self.var_joint) == 0:
@@ -285,6 +281,20 @@ class MappedCollection:
     def __len__(self):
         return self.n_obs

+    @property
+    def shape(self):
+        """Shape of the (virtually aligned) dataset."""
+        return (self.n_obs, self.n_vars)
+
+    @property
+    def original_shapes(self):
+        """Shapes of the underlying AnnData objects."""
+        if self.n_vars_list is None:
+            n_vars_list = [None] * len(self.n_obs_list)
+        else:
+            n_vars_list = self.n_vars_list
+        return list(zip(self.n_obs_list, n_vars_list))
+
     def __getitem__(self, idx: int):
         obs_idx = self.indices[idx]
         storage_idx = self.storage_idx[idx]
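The two properties added above are convenience views over bookkeeping the class already tracks; continuing the earlier sketch:

```python
# mc is the MappedCollection from the earlier sketch.
print(mc.shape)            # (total n_obs across files, n_vars after the join)
print(mc.original_shapes)  # [(n_obs_i, n_vars_i), ...], one tuple per file;
                           # n_vars_i is None when no join was computed
```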
@@ -294,17 +304,28 @@ class MappedCollection:
         var_idxs_join = None

         with _Connect(self.storages[storage_idx]) as store:
-            out = {
-                "x": …,
-            }
-            for label in self.label_keys:
+            out = {}
+            for layers_key in self.layers_keys:
+                lazy_data = (
+                    store["X"] if layers_key == "X" else store["layers"][layers_key]
+                )
+                out[layers_key] = self._get_data_idx(
+                    lazy_data, obs_idx, self.join_vars, var_idxs_join, self.n_vars
+                )
+            if self.obsm_keys is not None:
+                for obsm_key in self.obsm_keys:
+                    lazy_data = store["obsm"][obsm_key]
+                    out[f"obsm_{obsm_key}"] = self._get_data_idx(lazy_data, obs_idx)
+            out["_store_idx"] = storage_idx
+            if self.obs_keys is not None:
+                for label in self.obs_keys:
                     if label in self._cache_cats:
                         cats = self._cache_cats[label][storage_idx]
                         if cats is None:
                             cats = []
                     else:
                         cats = None
-                    label_idx = self._get_label_idx(store, obs_idx, label, cats)
+                    label_idx = self._get_obs_idx(store, obs_idx, label, cats)
                     if label in self.encoders:
                         label_idx = self.encoders[label][label_idx]
                     out[label] = label_idx
@@ -312,46 +333,46 @@ class MappedCollection:

     def _get_data_idx(
         self,
-        layer: ArrayType | GroupType,  # type: ignore
+        lazy_data: ArrayType | GroupType,  # type: ignore
         idx: int,
+        join_vars: Literal["inner", "outer"] | None = None,
         var_idxs_join: list | None = None,
-        …
+        n_vars_out: int | None = None,
     ):
         """Get the index for the data."""
-        if isinstance(layer, ArrayTypes):  # type: ignore
-            layer_idx = layer[idx]
-            if self.join_vars is None:
-                result = layer_idx
+        if isinstance(lazy_data, ArrayTypes):  # type: ignore
+            lazy_data_idx = lazy_data[idx]  # type: ignore
+            if join_vars is None:
+                result = lazy_data_idx
                 if self._dtype is not None:
                     result = result.astype(self._dtype, copy=False)
-            elif self.join_vars == "outer":
-                dtype = layer_idx.dtype if self._dtype is None else self._dtype
-                result = np.zeros(len(self.var_joint), dtype=dtype)
-                result[var_idxs_join] = layer_idx
+            elif join_vars == "outer":
+                dtype = lazy_data_idx.dtype if self._dtype is None else self._dtype
+                result = np.zeros(n_vars_out, dtype=dtype)
+                result[var_idxs_join] = lazy_data_idx
             else:  # inner join
-                result = layer_idx[var_idxs_join]
+                result = lazy_data_idx[var_idxs_join]
                 if self._dtype is not None:
                     result = result.astype(self._dtype, copy=False)
             return result
         else:  # assume csr_matrix here
-            data = layer["data"]  # type: ignore
-            indices = layer["indices"]  # type: ignore
-            indptr = layer["indptr"]  # type: ignore
+            data = lazy_data["data"]  # type: ignore
+            indices = lazy_data["indices"]  # type: ignore
+            indptr = lazy_data["indptr"]  # type: ignore
             s = slice(*(indptr[idx : idx + 2]))
             data_s = data[s]
             dtype = data_s.dtype if self._dtype is None else self._dtype
-            if self.join_vars == "outer":
-                layer_idx = np.zeros(len(self.var_joint), dtype=dtype)
-                layer_idx[var_idxs_join[indices[s]]] = data_s
+            if join_vars == "outer":
+                lazy_data_idx = np.zeros(n_vars_out, dtype=dtype)
+                lazy_data_idx[var_idxs_join[indices[s]]] = data_s
             else:
-                layer_idx = np.zeros(layer.attrs["shape"][1], dtype=dtype)
-                layer_idx[indices[s]] = data_s
-                if self.join_vars == "inner":
-                    layer_idx = layer_idx[var_idxs_join]
-            return layer_idx
+                lazy_data_idx = np.zeros(lazy_data.attrs["shape"][1], dtype=dtype)  # type: ignore
+                lazy_data_idx[indices[s]] = data_s
+                if join_vars == "inner":
+                    lazy_data_idx = lazy_data_idx[var_idxs_join]
+            return lazy_data_idx

-    def _get_label_idx(
+    def _get_obs_idx(
         self,
         storage: StorageType,
         idx: int,
@@ -379,12 +400,12 @@ class MappedCollection:
             label = label.decode("utf-8")
         return label

-    def get_label_weights(self, label_keys: str | list[str], scaler: int = 10):
+    def get_label_weights(self, obs_keys: str | list[str], scaler: int = 10):
         """Get all weights for the given label keys."""
-        if isinstance(label_keys, str):
-            label_keys = [label_keys]
+        if isinstance(obs_keys, str):
+            obs_keys = [obs_keys]
         labels_list = []
-        for label_key in label_keys:
+        for label_key in obs_keys:
             labels_to_str = self.get_merged_labels(label_key).astype(str).astype("O")
             labels_list.append(labels_to_str)
         if len(labels_list) > 1:
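If `get_label_weights` returns one weight per observation, as lamindb's method of the same name does (rarer labels receive larger weights, with `scaler` damping the extremes), it can drive a weighted sampler. A sketch under that assumption:

```python
# Assumption: get_label_weights returns a 1-D per-observation weight array,
# matching lamindb's MappedCollection.get_label_weights.
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

# mc is the MappedCollection from the earlier sketches.
weights = mc.get_label_weights("cell_type_ontology_term_id", scaler=10)
sampler = WeightedRandomSampler(
    weights=torch.as_tensor(weights, dtype=torch.double),
    num_samples=len(mc),
    replacement=True,  # oversample rare labels, undersample common ones
)
loader = DataLoader(mc, sampler=sampler, batch_size=64)
```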
scdataloader/preprocess.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Callable, Optional, Union
 from uuid import uuid4

 import anndata as ad
@@ -7,9 +7,7 @@ import numpy as np
 import pandas as pd
 import scanpy as sc
 from anndata import AnnData
-from django.db import IntegrityError
 from scipy.sparse import csr_matrix
-import os

 from scdataloader import utils as data_utils

@@ -268,9 +266,6 @@ class Preprocessor:
         # QC

         adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
-        for name in ["stable_id", "created_at", "updated_at"]:
-            if name in adata.var.columns:
-                adata.var = adata.var.drop(columns=name)
         print("startin QC")
         sc.pp.calculate_qc_metrics(
             adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
scdataloader/utils.py
CHANGED
@@ -138,7 +138,6 @@ def getBiomartTable(

     res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
     res.to_csv(cachefile, index=False)
-
     res.columns = attr + attributes
     if type(res) is not type(pd.DataFrame()):
         raise ValueError("should be a dataframe")
@@ -355,13 +354,12 @@ def load_dataset_local(

 def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
     organismdf = []
-    if type(organisms) == str:
+    if type(organisms) is str:
         organisms = [organisms]
     for organism in organisms:
         genesdf = bt.Gene.filter(
             organism_id=bt.Organism.filter(ontology_id=organism).first().id
         ).df()
-        genesdf = genesdf[~genesdf["public_source_id"].isna()]
         genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
         genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
         # mitochondrial genes
@@ -372,7 +370,12 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10
         genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
         genesdf["organism"] = organism
         organismdf.append(genesdf)
-    return pd.concat(organismdf)
+    organismdf = pd.concat(organismdf)
+    organismdf.drop(
+        columns=["source_id", "run_id", "created_by_id", "updated_at", "stable_id"],
+        inplace=True,
+    )
+    return organismdf


 def populate_my_ontology(
@@ -409,75 +412,82 @@ def populate_my_ontology(
         diseases (list, optional): List of diseases. Defaults to [].
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
-    …
+    # cell type
+    if celltypes is not None:
+        names = bt.CellType.public().df().index if not celltypes else celltypes
+        records = bt.CellType.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    …
+    if organisms is not None:
+        names = bt.Organism.public().df().index if not organisms else organisms
+        records = [
+            i[0] if type(i) is list else i
+            for i in [bt.Organism.from_source(ontology_id=i) for i in names]
+        ]
+        ln.save(records)
+        bt.Organism(name="unknown", ontology_id="unknown").save()
+        organism_names = names
     # Phenotype
-    …
-    ln.save(records, parents=bool(sex))
-    bt.Phenotype(name="unknown", ontology_id="unknown").save()
+    if sex is not None:
+        names = bt.Phenotype.public().df().index if not sex else sex
+        records = [
+            bt.Phenotype.from_source(
+                ontology_id=i, source=bt.PublicSource.filter(name="pato").first()
+            )
+            for i in names
+        ]
+        ln.save(records)
+        bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    …
+    if ethnicities is not None:
+        names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+        records = bt.Ethnicity.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Ethnicity(
+            name="unknown", ontology_id="unknown"
+        ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    …
+    if assays is not None:
+        names = bt.ExperimentalFactor.public().df().index if not assays else assays
+        records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
+        # lookup = bt.ExperimentalFactor.lookup()
+        # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    …
+    if tissues is not None:
+        names = bt.Tissue.public().df().index if not tissues else tissues
+        records = bt.Tissue.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
-    …
+    if dev_stages is not None:
+        names = (
+            bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
+        )
+        records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+        names = bt.DevelopmentalStage.public(organism="mouse").df().index
+        records = [
+            bt.DevelopmentalStage.from_source(
+                ontology_id=i,
+                source=bt.PublicSource.filter(organism="mouse", name="mmusdv").first(),
+            )
+            for i in names.tolist()
+        ]
+        ln.save(records)
     # Disease
-    …
+    if diseases is not None:
+        names = bt.Disease.public().df().index if not diseases else diseases
+        records = bt.Disease.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Disease(name="normal", ontology_id="PATO:0000461").save()
+        bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in organisms:
+    for organism in organism_names:
         # convert onto to name
         organism = bt.Organism.filter(ontology_id=organism).one().name
         names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
@@ -523,26 +533,6 @@ def length_normalize(adata: AnnData, gene_lengths: list):
     return adata


-def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
-    """
-    pd_load_cached downloads a file from a url and loads it as a pandas dataframe
-
-    Args:
-        url (str): the url to download the file from
-        loc (str, optional): the location to save the file to. Defaults to "/tmp/".
-        cache (bool, optional): whether to use the cached file or not. Defaults to True.
-
-    Returns:
-        pd.DataFrame: the dataframe
-    """
-    # Check if the file exists, if not, download it
-    loc += url.split("/")[-1]
-    if not os.path.isfile(loc) or not cache:
-        urllib.request.urlretrieve(url, loc)
-    # Load the data from the file
-    return pd.read_csv(loc, **kwargs)
-
-
 def translate(
     val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
 ):
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/METADATA
CHANGED
@@ -1,28 +1,37 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 1.0.1
+Version: 1.0.6
 Summary: a dataloader for single cell data in lamindb
 Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
-Keywords: scRNAseq,dataloader,pytorch,lamindb,…
+Keywords: scRNAseq,dataloader,pytorch,lamindb,scPRINT
 Author: jkobject
 Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
+Provides-Extra: dev
 Requires-Dist: anndata
 Requires-Dist: biomart
-Requires-Dist: bionty
+Requires-Dist: bionty (==0.48.0)
+Requires-Dist: black (>=23.10.1,<24.0.0) ; extra == "dev"
 Requires-Dist: cellxgene-census
+Requires-Dist: coverage (>=7.3.2,<8.0.0) ; extra == "dev"
 Requires-Dist: decoupler
 Requires-Dist: django
+Requires-Dist: flake8 (>=6.1.0,<7.0.0) ; extra == "dev"
+Requires-Dist: gitchangelog (>=3.0.4,<4.0.0) ; extra == "dev"
 Requires-Dist: ipykernel
-Requires-Dist: lamindb
+Requires-Dist: isort (>=5.12.0,<6.0.0) ; extra == "dev"
+Requires-Dist: lamindb (==0.75.1)
 Requires-Dist: leidenalg
 Requires-Dist: lightning
-Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
+Requires-Dist: mkdocs (>=1.5.3,<2.0.0) ; extra == "dev"
+Requires-Dist: mypy (>=1.6.1,<2.0.0) ; extra == "dev"
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: pytest (>=7.4.3,<8.0.0) ; extra == "dev"
+Requires-Dist: pytest-cov (>=4.1.0,<5.0.0) ; extra == "dev"
 Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
@@ -61,6 +70,8 @@ It allows you to:

 built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd

+The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
+
 ## More

 I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
@@ -71,16 +82,42 @@ I needed to create this Data Loader for my PhD project. I am using it to load &

 ```bash
 pip install scdataloader
+# or
+pip install scDataLoader[dev] # for dev dependencies
+
+lamin login <email> --key <API-key>
+lamin init --storage [folder-name-where-lamin-data-will-be-stored] --schema bionty
 ```

-…
+if you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT is using ontologies to define its cell types, diseases, sexes, ethnicities, etc.
+
+you can do it manually or with our function:
+
+```python
+from scdataloader.utils import populate_my_ontology
+
+populate_my_ontology() #to populate everything (recommended) (can take 2-10mns)
+
+populate_my_ontology( #the minimum for scprint to run some inferences (denoising, grn inference)
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes = None,
+    ethnicities = None,
+    assays = None,
+    tissues = None,
+    diseases = None,
+    dev_stages = None,
+)
+```
+
+### Dev install
+
+If you want to use the latest version of scDataLoader and work on the code yourself use `git clone` and `pip -e` instead of `pip install`.

 ```bash
 git clone https://github.com/jkobject/scDataLoader.git
-…
-poetry install
+pip install -e scDataLoader[dev]
 ```
-then run the notebooks with the poetry installed environment

 ## Usage

@@ -147,6 +184,27 @@ The main way to use

 > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage

+## FAQ
+
+### how to update my ontologies?
+
+```bash
+import bionty as bt
+bt.reset_sources()
+
+# Run via CLI: lamin load <your instance>
+
+import lnschema_bionty as lb
+lb.dev.sync_bionty_source_to_latest()
+```
+
+### how to load all ontologies?
+
+```python
+from scdataloader import utils
+utils.populate_ontologies() # this might take from 5-20mins
+```
+
 ## Development

 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
@@ -163,6 +221,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 - [scprint](https://www.jkobject.com/scPRINT/)

 Awesome single cell dataloader created by @jkobject
+
 GNU GENERAL PUBLIC LICENSE
   Version 3, 29 June 2007

scdataloader-1.0.6.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=jFS_q38a6b0acUjq5B57Co9K03JuDKxw-COi1F255gw,6
+scdataloader/__init__.py,sha256=lbO3lGiXXgirB07KXj1Fu0BzL7T43VmitqJBTyfSz7M,147
+scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=O5VK2asIfFIQc-Ozm55Bc-OORIlPj_yOt7qn6xqXd74,11292
+scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
+scdataloader/data.py,sha256=3a9jUhREIzbxC797COGNSn6QqbRiiC30FzxXCoYsTNo,13773
+scdataloader/datamodule.py,sha256=JZq8g274ce3ARW59qwg5GKAt2SzOTaMPGh3CySGQS70,16893
+scdataloader/mapped.py,sha256=s_Fg-lwaXjHFyQcKnp9El2IceMoaEajynyUgOnpVnXQ,20750
+scdataloader/preprocess.py,sha256=9dgsq7c5jD2l-CUGfwC2uG98MCIgnrYFkqknqAyu5dU,28841
+scdataloader/utils.py,sha256=8YIVpqJzNKkIIpAS5p01gyt57X2CrfaMEsC1EJs-q_A,21451
+scdataloader-1.0.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-1.0.6.dist-info/METADATA,sha256=LhqbssiiI5Y-GZeYk1nXnHFNo2SNhuc6W9LwlIX_OCo,43336
+scdataloader-1.0.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-1.0.6.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-1.0.6.dist-info/RECORD,,
scdataloader-1.0.1.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-scdataloader/VERSION,sha256=WYVJhIUxBN9cNT4vaBoV_HkkdC-aLkaMKa8kjc5FzgM,6
-scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
-scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
-scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
-scdataloader/collator.py,sha256=zkFdxirTDub1dJ1OJXO0p48kvd2r2ncKMdevAKIdTTc,13447
-scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
-scdataloader/data.py,sha256=VugtHo9T9PqoJSv3lkJJAB89KD-fRwdVw1D76gnCc9c,12584
-scdataloader/datamodule.py,sha256=WLEWcDMcC1G3VD5tORfhfqRRHcTscpI0EzPikg3udbI,16881
-scdataloader/mapped.py,sha256=yF9l3obuRWbQjW8QZGRSKhc50fizXTWf3Pe1m542fW8,19481
-scdataloader/preprocess.py,sha256=noynYWuy9clhFu9UnN-vSvAHJHwakDttkI5aj1e_T98,29055
-scdataloader/utils.py,sha256=xyDsWaqkjhzlVBP8FiYdBUWHsel3twcVWmI53PhKqTM,21888
-scdataloader-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-scdataloader-1.0.1.dist-info/METADATA,sha256=2Xd8M1dq_JmvmFjmrrzn-1U4eOtwU6L51Y_7MCkGxvY,41327
-scdataloader-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-scdataloader-1.0.1.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
-scdataloader-1.0.1.dist-info/RECORD,,
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/LICENSE
File without changes
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/WHEEL
File without changes
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/entry_points.txt
File without changes