lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/_view.py +2 -2
- lamindb/base/types.py +50 -11
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +11 -1
- lamindb/curators/__init__.py +1430 -1665
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +86 -42
- lamindb/models/_from_values.py +110 -119
- lamindb/models/_label_manager.py +17 -10
- lamindb/models/artifact.py +170 -102
- lamindb/models/can_curate.py +200 -231
- lamindb/models/feature.py +76 -47
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +77 -50
- lamindb/models/run.py +20 -7
- lamindb/models/schema.py +7 -15
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
"""Curators.
|
2
2
|
|
3
|
-
.. versionadded:: 1.1.0
|
4
|
-
|
5
3
|
.. autosummary::
|
6
4
|
:toctree: .
|
7
5
|
|
8
|
-
Curator
|
9
6
|
DataFrameCurator
|
10
7
|
AnnDataCurator
|
8
|
+
MuDataCurator
|
9
|
+
SpatialDataCurator
|
11
10
|
|
12
|
-
|
11
|
+
Helper classes.
|
13
12
|
|
14
13
|
.. autosummary::
|
15
14
|
:toctree: .
|
16
15
|
|
16
|
+
Curator
|
17
|
+
SlotsCurator
|
17
18
|
CatManager
|
19
|
+
CatLookup
|
18
20
|
DataFrameCatManager
|
19
21
|
AnnDataCatManager
|
20
22
|
MuDataCatManager
|
23
|
+
SpatialDataCatManager
|
21
24
|
TiledbsomaCatManager
|
22
|
-
CurateLookup
|
23
25
|
|
24
26
|
"""
|
25
27
|
|
@@ -27,9 +29,8 @@ from __future__ import annotations
|
|
27
29
|
|
28
30
|
import copy
|
29
31
|
import re
|
30
|
-
from importlib import resources
|
31
32
|
from itertools import chain
|
32
|
-
from typing import TYPE_CHECKING, Any, Literal
|
33
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal
|
33
34
|
|
34
35
|
import anndata as ad
|
35
36
|
import lamindb_setup as ln_setup
|
@@ -37,45 +38,44 @@ import pandas as pd
|
|
37
38
|
import pandera
|
38
39
|
import pyarrow as pa
|
39
40
|
from lamin_utils import colors, logger
|
40
|
-
from lamindb_setup.core import deprecated
|
41
|
+
from lamindb_setup.core import deprecated
|
41
42
|
from lamindb_setup.core._docs import doc_args
|
42
43
|
from lamindb_setup.core.upath import UPath
|
43
44
|
|
44
|
-
from lamindb.core.storage._backed_access import backed_access
|
45
|
-
|
46
|
-
from ._cellxgene_schemas import _read_schema_versions
|
47
|
-
|
48
45
|
if TYPE_CHECKING:
|
49
|
-
from anndata import AnnData
|
50
46
|
from lamindb_setup.core.types import UPathStr
|
47
|
+
from mudata import MuData
|
48
|
+
from spatialdata import SpatialData
|
51
49
|
|
52
|
-
from lamindb.
|
50
|
+
from lamindb.core.types import ScverseDataStructures
|
53
51
|
from lamindb.models import Record
|
54
52
|
from lamindb.base.types import FieldAttr # noqa
|
55
53
|
from lamindb.core._settings import settings
|
56
54
|
from lamindb.models import (
|
57
55
|
Artifact,
|
58
|
-
Collection,
|
59
56
|
Feature,
|
60
57
|
Record,
|
61
58
|
Run,
|
62
59
|
Schema,
|
63
60
|
ULabel,
|
64
61
|
)
|
65
|
-
from lamindb.models.
|
66
|
-
|
67
|
-
|
62
|
+
from lamindb.models.artifact import (
|
63
|
+
add_labels,
|
64
|
+
data_is_anndata,
|
65
|
+
data_is_mudata,
|
66
|
+
data_is_spatialdata,
|
67
|
+
)
|
68
|
+
from lamindb.models.feature import parse_dtype, parse_cat_dtype
|
68
69
|
from lamindb.models._from_values import _format_values
|
69
70
|
|
70
71
|
from ..errors import InvalidArgument, ValidationError
|
72
|
+
from anndata import AnnData
|
71
73
|
|
72
74
|
if TYPE_CHECKING:
|
73
75
|
from collections.abc import Iterable, MutableMapping
|
74
76
|
from typing import Any
|
75
77
|
|
76
78
|
from lamindb_setup.core.types import UPathStr
|
77
|
-
from mudata import MuData
|
78
|
-
from spatialdata import SpatialData
|
79
79
|
|
80
80
|
from lamindb.models.query_set import RecordList
|
81
81
|
|
@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
|
|
86
86
|
return ansi_pattern.sub("", text)
|
87
87
|
|
88
88
|
|
89
|
-
class
|
89
|
+
class CatLookup:
|
90
90
|
"""Lookup categories from the reference instance.
|
91
91
|
|
92
92
|
Args:
|
@@ -94,10 +94,10 @@ class CurateLookup:
|
|
94
94
|
slots: A dictionary of slot fields to lookup.
|
95
95
|
public: Whether to lookup from the public instance. Defaults to False.
|
96
96
|
|
97
|
-
Example
|
98
|
-
|
99
|
-
|
100
|
-
|
97
|
+
Example::
|
98
|
+
|
99
|
+
curator = ln.curators.DataFrameCurator(...)
|
100
|
+
curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
|
101
101
|
|
102
102
|
"""
|
103
103
|
|
@@ -106,16 +106,22 @@ class CurateLookup:
|
|
106
106
|
categoricals: dict[str, FieldAttr],
|
107
107
|
slots: dict[str, FieldAttr] = None,
|
108
108
|
public: bool = False,
|
109
|
+
organism: str | None = None,
|
110
|
+
sources: dict[str, Record] | None = None,
|
109
111
|
) -> None:
|
110
112
|
slots = slots or {}
|
111
113
|
self._categoricals = {**categoricals, **slots}
|
112
114
|
self._public = public
|
115
|
+
self._organism = organism
|
116
|
+
self._sources = sources
|
113
117
|
|
114
118
|
def __getattr__(self, name):
|
115
119
|
if name in self._categoricals:
|
116
120
|
registry = self._categoricals[name].field.model
|
117
121
|
if self._public and hasattr(registry, "public"):
|
118
|
-
return registry.public(
|
122
|
+
return registry.public(
|
123
|
+
organism=self._organism, source=self._sources.get(name)
|
124
|
+
).lookup()
|
119
125
|
else:
|
120
126
|
return registry.lookup()
|
121
127
|
raise AttributeError(
|
@@ -126,7 +132,9 @@ class CurateLookup:
|
|
126
132
|
if name in self._categoricals:
|
127
133
|
registry = self._categoricals[name].field.model
|
128
134
|
if self._public and hasattr(registry, "public"):
|
129
|
-
return registry.public(
|
135
|
+
return registry.public(
|
136
|
+
organism=self._organism, source=self._sources.get(name)
|
137
|
+
).lookup()
|
130
138
|
else:
|
131
139
|
return registry.lookup()
|
132
140
|
raise AttributeError(
|
@@ -150,7 +158,7 @@ class CurateLookup:
|
|
150
158
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
151
159
|
"To look up public ontologies, use .lookup(public=True)"
|
152
160
|
)
|
153
|
-
else: #
|
161
|
+
else: # pragma: no cover
|
154
162
|
return colors.warning("No fields are found!")
|
155
163
|
|
156
164
|
|
@@ -163,7 +171,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
|
|
163
171
|
"""
|
164
172
|
|
165
173
|
|
166
|
-
VALIDATE_DOCSTRING = """Validate dataset.
|
174
|
+
VALIDATE_DOCSTRING = """Validate dataset against Schema.
|
167
175
|
|
168
176
|
Raises:
|
169
177
|
lamindb.errors.ValidationError: If validation fails.
|
@@ -183,15 +191,17 @@ Returns:
|
|
183
191
|
|
184
192
|
|
185
193
|
class Curator:
|
186
|
-
"""
|
194
|
+
"""Curator base class.
|
187
195
|
|
188
196
|
A `Curator` object makes it easy to validate, standardize & annotate datasets.
|
189
197
|
|
190
|
-
.. versionadded:: 1.1.0
|
191
|
-
|
192
198
|
See:
|
193
199
|
- :class:`~lamindb.curators.DataFrameCurator`
|
194
200
|
- :class:`~lamindb.curators.AnnDataCurator`
|
201
|
+
- :class:`~lamindb.curators.MuDataCurator`
|
202
|
+
- :class:`~lamindb.curators.SpatialDataCurator`
|
203
|
+
|
204
|
+
.. versionadded:: 1.1.0
|
195
205
|
"""
|
196
206
|
|
197
207
|
def __init__(self, dataset: Any, schema: Schema | None = None):
|
@@ -199,7 +209,12 @@ class Curator:
|
|
199
209
|
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
200
210
|
if isinstance(self._dataset, Artifact):
|
201
211
|
self._artifact = self._dataset
|
202
|
-
if self._artifact.otype in {
|
212
|
+
if self._artifact.otype in {
|
213
|
+
"DataFrame",
|
214
|
+
"AnnData",
|
215
|
+
"MuData",
|
216
|
+
"SpatialData",
|
217
|
+
}:
|
203
218
|
self._dataset = self._dataset.load()
|
204
219
|
self._schema: Schema | None = schema
|
205
220
|
self._is_validated: bool = False
|
@@ -208,7 +223,7 @@ class Curator:
|
|
208
223
|
@doc_args(VALIDATE_DOCSTRING)
|
209
224
|
def validate(self) -> bool | str:
|
210
225
|
"""{}""" # noqa: D415
|
211
|
-
pass #
|
226
|
+
pass # pragma: no cover
|
212
227
|
|
213
228
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
214
229
|
def save_artifact(
|
@@ -222,12 +237,97 @@ class Curator:
|
|
222
237
|
"""{}""" # noqa: D415
|
223
238
|
# Note that this docstring has to be consistent with the Artifact()
|
224
239
|
# constructor signature
|
225
|
-
pass
|
240
|
+
pass # pragma: no cover
|
241
|
+
|
242
|
+
|
243
|
+
class SlotsCurator(Curator):
|
244
|
+
"""Curator for a dataset with slots.
|
245
|
+
|
246
|
+
Args:
|
247
|
+
dataset: The dataset to validate & annotate.
|
248
|
+
schema: A `Schema` object that defines the validation constraints.
|
249
|
+
|
250
|
+
.. versionadded:: 1.3.0
|
251
|
+
"""
|
252
|
+
|
253
|
+
def __init__(
|
254
|
+
self,
|
255
|
+
dataset: Any,
|
256
|
+
schema: Schema,
|
257
|
+
) -> None:
|
258
|
+
super().__init__(dataset=dataset, schema=schema)
|
259
|
+
self._slots: dict[str, DataFrameCurator] = {}
|
260
|
+
|
261
|
+
# used in MuDataCurator and SpatialDataCurator
|
262
|
+
# in form of {table/modality_key: var_field}
|
263
|
+
self._var_fields: dict[str, FieldAttr] = {}
|
264
|
+
# in form of {table/modality_key: categoricals}
|
265
|
+
self._categoricals: dict[str, dict[str, FieldAttr]] = {}
|
266
|
+
|
267
|
+
@property
|
268
|
+
@doc_args(SLOTS_DOCSTRING)
|
269
|
+
def slots(self) -> dict[str, DataFrameCurator]:
|
270
|
+
"""{}""" # noqa: D415
|
271
|
+
return self._slots
|
272
|
+
|
273
|
+
@doc_args(VALIDATE_DOCSTRING)
|
274
|
+
def validate(self) -> None:
|
275
|
+
"""{}""" # noqa: D415
|
276
|
+
for _, curator in self._slots.items():
|
277
|
+
curator.validate()
|
278
|
+
|
279
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
280
|
+
def save_artifact(
|
281
|
+
self,
|
282
|
+
*,
|
283
|
+
key: str | None = None,
|
284
|
+
description: str | None = None,
|
285
|
+
revises: Artifact | None = None,
|
286
|
+
run: Run | None = None,
|
287
|
+
) -> Artifact:
|
288
|
+
"""{}""" # noqa: D415
|
289
|
+
if not self._is_validated:
|
290
|
+
self.validate()
|
291
|
+
|
292
|
+
# default implementation for MuDataCurator and SpatialDataCurator
|
293
|
+
return save_artifact( # type: ignore
|
294
|
+
self._dataset,
|
295
|
+
key=key,
|
296
|
+
description=description,
|
297
|
+
fields=self._categoricals,
|
298
|
+
index_field=self._var_fields,
|
299
|
+
artifact=self._artifact,
|
300
|
+
revises=revises,
|
301
|
+
run=run,
|
302
|
+
schema=self._schema,
|
303
|
+
)
|
304
|
+
|
305
|
+
|
306
|
+
def check_dtype(expected_type) -> Callable:
|
307
|
+
"""Creates a check function for Pandera that validates a column's dtype.
|
308
|
+
|
309
|
+
Args:
|
310
|
+
expected_type: String identifier for the expected type ('int', 'float', or 'num')
|
311
|
+
|
312
|
+
Returns:
|
313
|
+
A function that checks if a series has the expected dtype
|
314
|
+
"""
|
315
|
+
|
316
|
+
def check_function(series):
|
317
|
+
if expected_type == "int":
|
318
|
+
is_valid = pd.api.types.is_integer_dtype(series.dtype)
|
319
|
+
elif expected_type == "float":
|
320
|
+
is_valid = pd.api.types.is_float_dtype(series.dtype)
|
321
|
+
elif expected_type == "num":
|
322
|
+
is_valid = pd.api.types.is_numeric_dtype(series.dtype)
|
323
|
+
return is_valid
|
324
|
+
|
325
|
+
return check_function
|
226
326
|
|
227
327
|
|
228
328
|
class DataFrameCurator(Curator):
|
229
329
|
# the example in the docstring is tested in test_curators_quickstart_example
|
230
|
-
"""Curator for
|
330
|
+
"""Curator for `DataFrame`.
|
231
331
|
|
232
332
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
233
333
|
|
@@ -278,12 +378,33 @@ class DataFrameCurator(Curator):
|
|
278
378
|
# populate features
|
279
379
|
pandera_columns = {}
|
280
380
|
for feature in schema.features.all():
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
381
|
+
if feature.dtype in {"int", "float", "num"}:
|
382
|
+
dtype = (
|
383
|
+
self._dataset[feature.name].dtype
|
384
|
+
if feature.name in self._dataset.columns
|
385
|
+
else None
|
386
|
+
)
|
387
|
+
pandera_columns[feature.name] = pandera.Column(
|
388
|
+
dtype=None,
|
389
|
+
checks=pandera.Check(
|
390
|
+
check_dtype(feature.dtype),
|
391
|
+
element_wise=False,
|
392
|
+
error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
|
393
|
+
),
|
394
|
+
nullable=feature.nullable,
|
395
|
+
coerce=feature.coerce_dtype,
|
396
|
+
)
|
397
|
+
else:
|
398
|
+
pandera_dtype = (
|
399
|
+
feature.dtype
|
400
|
+
if not feature.dtype.startswith("cat")
|
401
|
+
else "category"
|
402
|
+
)
|
403
|
+
pandera_columns[feature.name] = pandera.Column(
|
404
|
+
pandera_dtype,
|
405
|
+
nullable=feature.nullable,
|
406
|
+
coerce=feature.coerce_dtype,
|
407
|
+
)
|
287
408
|
if feature.dtype.startswith("cat"):
|
288
409
|
categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
|
289
410
|
self._pandera_schema = pandera.DataFrameSchema(
|
@@ -293,7 +414,7 @@ class DataFrameCurator(Curator):
|
|
293
414
|
assert schema.itype is not None # noqa: S101
|
294
415
|
self._cat_manager = DataFrameCatManager(
|
295
416
|
self._dataset,
|
296
|
-
columns=
|
417
|
+
columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
|
297
418
|
categoricals=categoricals,
|
298
419
|
)
|
299
420
|
|
@@ -378,16 +499,16 @@ class DataFrameCurator(Curator):
|
|
378
499
|
description: str | None = None,
|
379
500
|
revises: Artifact | None = None,
|
380
501
|
run: Run | None = None,
|
381
|
-
):
|
502
|
+
) -> Artifact:
|
382
503
|
"""{}""" # noqa: D415
|
383
504
|
if not self._is_validated:
|
384
505
|
self.validate() # raises ValidationError if doesn't validate
|
385
|
-
result =
|
506
|
+
result = parse_cat_dtype(self._schema.itype, is_itype=True)
|
386
507
|
return save_artifact( # type: ignore
|
387
508
|
self._dataset,
|
388
509
|
description=description,
|
389
510
|
fields=self._cat_manager.categoricals,
|
390
|
-
|
511
|
+
index_field=result["field"],
|
391
512
|
key=key,
|
392
513
|
artifact=self._artifact,
|
393
514
|
revises=revises,
|
@@ -396,9 +517,9 @@ class DataFrameCurator(Curator):
|
|
396
517
|
)
|
397
518
|
|
398
519
|
|
399
|
-
class AnnDataCurator(
|
520
|
+
class AnnDataCurator(SlotsCurator):
|
400
521
|
# the example in the docstring is tested in test_curators_quickstart_example
|
401
|
-
"""Curator for
|
522
|
+
"""Curator for `AnnData`.
|
402
523
|
|
403
524
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
404
525
|
|
@@ -446,7 +567,7 @@ class AnnDataCurator(Curator):
|
|
446
567
|
).save()
|
447
568
|
|
448
569
|
# curate an AnnData
|
449
|
-
adata = datasets.small_dataset1(otype="AnnData")
|
570
|
+
adata = ln.core.datasets.small_dataset1(otype="AnnData")
|
450
571
|
curator = ln.curators.AnnDataCurator(adata, anndata_schema)
|
451
572
|
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
|
452
573
|
assert artifact.schema == anndata_schema
|
@@ -466,28 +587,16 @@ class AnnDataCurator(Curator):
|
|
466
587
|
self._slots = {
|
467
588
|
slot: DataFrameCurator(
|
468
589
|
(
|
469
|
-
self._dataset
|
590
|
+
getattr(self._dataset, slot).T
|
470
591
|
if slot == "var"
|
471
|
-
else self._dataset
|
592
|
+
else getattr(self._dataset, slot)
|
472
593
|
),
|
473
594
|
slot_schema,
|
474
595
|
)
|
475
596
|
for slot, slot_schema in schema.slots.items()
|
476
|
-
if slot in {"obs", "var"}
|
597
|
+
if slot in {"obs", "var", "uns"}
|
477
598
|
}
|
478
599
|
|
479
|
-
@property
|
480
|
-
@doc_args(SLOTS_DOCSTRING)
|
481
|
-
def slots(self) -> dict[str, DataFrameCurator]:
|
482
|
-
"""{}""" # noqa: D415
|
483
|
-
return self._slots
|
484
|
-
|
485
|
-
@doc_args(VALIDATE_DOCSTRING)
|
486
|
-
def validate(self) -> None:
|
487
|
-
"""{}""" # noqa: D415
|
488
|
-
for _, curator in self._slots.items():
|
489
|
-
curator.validate()
|
490
|
-
|
491
600
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
492
601
|
def save_artifact(
|
493
602
|
self,
|
@@ -496,18 +605,20 @@ class AnnDataCurator(Curator):
|
|
496
605
|
description: str | None = None,
|
497
606
|
revises: Artifact | None = None,
|
498
607
|
run: Run | None = None,
|
499
|
-
):
|
608
|
+
) -> Artifact:
|
500
609
|
"""{}""" # noqa: D415
|
501
610
|
if not self._is_validated:
|
502
611
|
self.validate()
|
612
|
+
if "obs" in self.slots:
|
613
|
+
categoricals = self.slots["obs"]._cat_manager.categoricals
|
614
|
+
else:
|
615
|
+
categoricals = {}
|
503
616
|
return save_artifact( # type: ignore
|
504
617
|
self._dataset,
|
505
618
|
description=description,
|
506
|
-
fields=
|
507
|
-
|
508
|
-
|
509
|
-
"field"
|
510
|
-
]
|
619
|
+
fields=categoricals,
|
620
|
+
index_field=(
|
621
|
+
parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
|
511
622
|
if "var" in self._slots
|
512
623
|
else None
|
513
624
|
),
|
@@ -519,34 +630,286 @@ class AnnDataCurator(Curator):
|
|
519
630
|
)
|
520
631
|
|
521
632
|
|
522
|
-
|
523
|
-
|
633
|
+
def _assign_var_fields_categoricals_multimodal(
|
634
|
+
modality: str | None,
|
635
|
+
slot_type: str,
|
636
|
+
slot: str,
|
637
|
+
slot_schema: Schema,
|
638
|
+
var_fields: dict[str, FieldAttr],
|
639
|
+
categoricals: dict[str, dict[str, FieldAttr]],
|
640
|
+
slots: dict[str, DataFrameCurator],
|
641
|
+
) -> None:
|
642
|
+
"""Assigns var_fields and categoricals for multimodal data curators."""
|
643
|
+
if modality is not None:
|
644
|
+
# Makes sure that all tables are present
|
645
|
+
var_fields[modality] = None
|
646
|
+
categoricals[modality] = {}
|
647
|
+
|
648
|
+
if slot_type == "var":
|
649
|
+
var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
|
650
|
+
if modality is None:
|
651
|
+
# This should rarely/never be used since tables should have different var fields
|
652
|
+
var_fields[slot] = var_field # pragma: no cover
|
653
|
+
else:
|
654
|
+
# Note that this is NOT nested since the nested key is always "var"
|
655
|
+
var_fields[modality] = var_field
|
656
|
+
else:
|
657
|
+
obs_fields = slots[slot]._cat_manager.categoricals
|
658
|
+
if modality is None:
|
659
|
+
categoricals[slot] = obs_fields
|
660
|
+
else:
|
661
|
+
# Note that this is NOT nested since the nested key is always "obs"
|
662
|
+
categoricals[modality] = obs_fields
|
663
|
+
|
664
|
+
|
665
|
+
class MuDataCurator(SlotsCurator):
|
666
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
667
|
+
"""Curator for `MuData`.
|
524
668
|
|
525
|
-
|
669
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
526
670
|
|
527
|
-
|
671
|
+
.. versionadded:: 1.3.0
|
528
672
|
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
>>> columns=Feature.name, # map column names
|
533
|
-
>>> categoricals={"perturbation": ULabel.name}, # map categories
|
534
|
-
>>> )
|
535
|
-
>>> cat_manager.validate() # validate the dataframe
|
536
|
-
>>> artifact = cat_manager.save_artifact(description="my RNA-seq")
|
537
|
-
>>> artifact.describe() # see annotations
|
673
|
+
Args:
|
674
|
+
dataset: The MuData-like object to validate & annotate.
|
675
|
+
schema: A `Schema` object that defines the validation constraints.
|
538
676
|
|
539
|
-
|
677
|
+
Example::
|
540
678
|
|
541
|
-
|
679
|
+
import lamindb as ln
|
680
|
+
import bionty as bt
|
681
|
+
|
682
|
+
# define the global obs schema
|
683
|
+
obs_schema = ln.Schema(
|
684
|
+
name="mudata_papalexi21_subset_obs_schema",
|
685
|
+
features=[
|
686
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
687
|
+
ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
|
688
|
+
],
|
689
|
+
).save()
|
690
|
+
|
691
|
+
# define the ['rna'].obs schema
|
692
|
+
obs_schema_rna = ln.Schema(
|
693
|
+
name="mudata_papalexi21_subset_rna_obs_schema",
|
694
|
+
features=[
|
695
|
+
ln.Feature(name="nCount_RNA", dtype=int).save(),
|
696
|
+
ln.Feature(name="nFeature_RNA", dtype=int).save(),
|
697
|
+
ln.Feature(name="percent.mito", dtype=float).save(),
|
698
|
+
],
|
699
|
+
coerce_dtype=True,
|
700
|
+
).save()
|
701
|
+
|
702
|
+
# define the ['hto'].obs schema
|
703
|
+
obs_schema_hto = ln.Schema(
|
704
|
+
name="mudata_papalexi21_subset_hto_obs_schema",
|
705
|
+
features=[
|
706
|
+
ln.Feature(name="nCount_HTO", dtype=int).save(),
|
707
|
+
ln.Feature(name="nFeature_HTO", dtype=int).save(),
|
708
|
+
ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
|
709
|
+
],
|
710
|
+
coerce_dtype=True,
|
711
|
+
).save()
|
712
|
+
|
713
|
+
# define ['rna'].var schema
|
714
|
+
var_schema_rna = ln.Schema(
|
715
|
+
name="mudata_papalexi21_subset_rna_var_schema",
|
716
|
+
itype=bt.Gene.symbol,
|
717
|
+
dtype=float,
|
718
|
+
).save()
|
542
719
|
|
543
|
-
|
544
|
-
|
720
|
+
# define composite schema
|
721
|
+
mudata_schema = ln.Schema(
|
722
|
+
name="mudata_papalexi21_subset_mudata_schema",
|
723
|
+
otype="MuData",
|
724
|
+
components={
|
725
|
+
"obs": obs_schema,
|
726
|
+
"rna:obs": obs_schema_rna,
|
727
|
+
"hto:obs": obs_schema_hto,
|
728
|
+
"rna:var": var_schema_rna,
|
729
|
+
},
|
730
|
+
).save()
|
731
|
+
|
732
|
+
# curate a MuData
|
733
|
+
mdata = ln.core.datasets.mudata_papalexi21_subset()
|
734
|
+
bt.settings.organism = "human" # set the organism
|
735
|
+
curator = ln.curators.MuDataCurator(mdata, mudata_schema)
|
736
|
+
artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
|
737
|
+
assert artifact.schema == mudata_schema
|
545
738
|
"""
|
546
739
|
|
547
740
|
def __init__(
|
548
|
-
self,
|
549
|
-
|
741
|
+
self,
|
742
|
+
dataset: MuData | Artifact,
|
743
|
+
schema: Schema,
|
744
|
+
) -> None:
|
745
|
+
super().__init__(dataset=dataset, schema=schema)
|
746
|
+
if not data_is_mudata(self._dataset):
|
747
|
+
raise InvalidArgument("dataset must be MuData-like.")
|
748
|
+
if schema.otype != "MuData":
|
749
|
+
raise InvalidArgument("Schema otype must be 'MuData'.")
|
750
|
+
|
751
|
+
for slot, slot_schema in schema.slots.items():
|
752
|
+
# Assign to _slots
|
753
|
+
if ":" in slot:
|
754
|
+
modality, modality_slot = slot.split(":")
|
755
|
+
schema_dataset = self._dataset.__getitem__(modality)
|
756
|
+
else:
|
757
|
+
modality, modality_slot = None, slot
|
758
|
+
schema_dataset = self._dataset
|
759
|
+
self._slots[slot] = DataFrameCurator(
|
760
|
+
(
|
761
|
+
getattr(schema_dataset, modality_slot).T
|
762
|
+
if modality_slot == "var"
|
763
|
+
else getattr(schema_dataset, modality_slot)
|
764
|
+
),
|
765
|
+
slot_schema,
|
766
|
+
)
|
767
|
+
_assign_var_fields_categoricals_multimodal(
|
768
|
+
modality=modality,
|
769
|
+
slot_type=modality_slot,
|
770
|
+
slot=slot,
|
771
|
+
slot_schema=slot_schema,
|
772
|
+
var_fields=self._var_fields,
|
773
|
+
categoricals=self._categoricals,
|
774
|
+
slots=self._slots,
|
775
|
+
)
|
776
|
+
|
777
|
+
# for consistency with BaseCatManager
|
778
|
+
self._columns_field = self._var_fields
|
779
|
+
|
780
|
+
|
781
|
+
class SpatialDataCurator(SlotsCurator):
|
782
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
783
|
+
"""Curator for `SpatialData`.
|
784
|
+
|
785
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
786
|
+
|
787
|
+
.. versionadded:: 1.3.0
|
788
|
+
|
789
|
+
Args:
|
790
|
+
dataset: The SpatialData-like object to validate & annotate.
|
791
|
+
schema: A `Schema` object that defines the validation constraints.
|
792
|
+
|
793
|
+
Example::
|
794
|
+
|
795
|
+
import lamindb as ln
|
796
|
+
import bionty as bt
|
797
|
+
|
798
|
+
# define sample schema
|
799
|
+
sample_schema = ln.Schema(
|
800
|
+
name="blobs_sample_level_metadata",
|
801
|
+
features=[
|
802
|
+
ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
|
803
|
+
ln.Feature(name="disease", dtype=bt.Disease).save(),
|
804
|
+
ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
|
805
|
+
],
|
806
|
+
coerce_dtype=True
|
807
|
+
).save()
|
808
|
+
|
809
|
+
# define table obs schema
|
810
|
+
blobs_obs_schema = ln.Schema(
|
811
|
+
name="blobs_obs_level_metadata",
|
812
|
+
features=[
|
813
|
+
ln.Feature(name="sample_region", dtype="str").save(),
|
814
|
+
],
|
815
|
+
coerce_dtype=True
|
816
|
+
).save()
|
817
|
+
|
818
|
+
# define table var schema
|
819
|
+
blobs_var_schema = ln.Schema(
|
820
|
+
name="blobs_var_schema",
|
821
|
+
itype=bt.Gene.ensembl_gene_id,
|
822
|
+
dtype=int
|
823
|
+
).save()
|
824
|
+
|
825
|
+
# define composite schema
|
826
|
+
spatialdata_schema = ln.Schema(
|
827
|
+
name="blobs_spatialdata_schema",
|
828
|
+
otype="SpatialData",
|
829
|
+
components={
|
830
|
+
"sample": sample_schema,
|
831
|
+
"table:obs": blobs_obs_schema,
|
832
|
+
"table:var": blobs_var_schema,
|
833
|
+
}).save()
|
834
|
+
|
835
|
+
# curate a SpatialData
|
836
|
+
spatialdata = ln.core.datasets.spatialdata_blobs()
|
837
|
+
curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
|
838
|
+
try:
|
839
|
+
curator.validate()
|
840
|
+
except ln.errors.ValidationError as error:
|
841
|
+
print(error)
|
842
|
+
|
843
|
+
# validate again (must pass now) and save artifact
|
844
|
+
artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
|
845
|
+
assert artifact.schema == spatialdata_schema
|
846
|
+
"""
|
847
|
+
|
848
|
+
def __init__(
|
849
|
+
self,
|
850
|
+
dataset: SpatialData | Artifact,
|
851
|
+
schema: Schema,
|
852
|
+
*,
|
853
|
+
sample_metadata_key: str | None = "sample",
|
854
|
+
) -> None:
|
855
|
+
super().__init__(dataset=dataset, schema=schema)
|
856
|
+
if not data_is_spatialdata(self._dataset):
|
857
|
+
raise InvalidArgument("dataset must be SpatialData-like.")
|
858
|
+
if schema.otype != "SpatialData":
|
859
|
+
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
860
|
+
|
861
|
+
for slot, slot_schema in schema.slots.items():
|
862
|
+
# Assign to _slots
|
863
|
+
if ":" in slot:
|
864
|
+
table_key, table_slot = slot.split(":")
|
865
|
+
schema_dataset = self._dataset.tables.__getitem__(table_key)
|
866
|
+
# sample metadata (does not have a `:` separator)
|
867
|
+
else:
|
868
|
+
table_key = None
|
869
|
+
table_slot = slot
|
870
|
+
schema_dataset = self._dataset.get_attrs(
|
871
|
+
key=sample_metadata_key, return_as="df", flatten=True
|
872
|
+
)
|
873
|
+
|
874
|
+
self._slots[slot] = DataFrameCurator(
|
875
|
+
(
|
876
|
+
getattr(schema_dataset, table_slot).T
|
877
|
+
if table_slot == "var"
|
878
|
+
else (
|
879
|
+
getattr(schema_dataset, table_slot)
|
880
|
+
if table_slot != sample_metadata_key
|
881
|
+
else schema_dataset
|
882
|
+
) # just take the schema_dataset if it's the sample metadata key
|
883
|
+
),
|
884
|
+
slot_schema,
|
885
|
+
)
|
886
|
+
|
887
|
+
_assign_var_fields_categoricals_multimodal(
|
888
|
+
modality=table_key,
|
889
|
+
slot_type=table_slot,
|
890
|
+
slot=slot,
|
891
|
+
slot_schema=slot_schema,
|
892
|
+
var_fields=self._var_fields,
|
893
|
+
categoricals=self._categoricals,
|
894
|
+
slots=self._slots,
|
895
|
+
)
|
896
|
+
|
897
|
+
# for consistency with BaseCatManager
|
898
|
+
self._columns_field = self._var_fields
|
899
|
+
|
900
|
+
|
901
|
+
class CatManager:
|
902
|
+
"""Manage categoricals by updating registries.
|
903
|
+
|
904
|
+
This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
|
905
|
+
|
906
|
+
If you find non-validated values, you have several options:
|
907
|
+
|
908
|
+
- new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
|
909
|
+
- non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
|
910
|
+
"""
|
911
|
+
|
912
|
+
def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
|
550
913
|
# the below is shared with Curator
|
551
914
|
self._artifact: Artifact = None # pass the dataset as an artifact
|
552
915
|
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
@@ -558,11 +921,16 @@ class CatManager:
|
|
558
921
|
# shared until here
|
559
922
|
self._categoricals = categoricals or {}
|
560
923
|
self._non_validated = None
|
561
|
-
self._organism = organism
|
562
924
|
self._sources = sources or {}
|
563
|
-
self._exclude = exclude or {}
|
564
925
|
self._columns_field = columns_field
|
565
926
|
self._validate_category_error_messages: str = ""
|
927
|
+
# make sure to only fetch organism once at the beginning
|
928
|
+
if organism:
|
929
|
+
self._organism = organism
|
930
|
+
else:
|
931
|
+
fields = list(self._categoricals.values()) + [columns_field]
|
932
|
+
organisms = {get_organism_kwargs(field).get("organism") for field in fields}
|
933
|
+
self._organism = organisms.pop() if len(organisms) > 0 else None
|
566
934
|
|
567
935
|
@property
|
568
936
|
def non_validated(self) -> dict[str, list[str]]:
|
@@ -607,7 +975,7 @@ class CatManager:
|
|
607
975
|
Returns:
|
608
976
|
The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
|
609
977
|
"""
|
610
|
-
pass
|
978
|
+
pass # pragma: no cover
|
611
979
|
|
612
980
|
def standardize(self, key: str) -> None:
|
613
981
|
"""Replace synonyms with standardized values.
|
@@ -620,7 +988,7 @@ class CatManager:
|
|
620
988
|
Returns:
|
621
989
|
None
|
622
990
|
"""
|
623
|
-
pass #
|
991
|
+
pass # pragma: no cover
|
624
992
|
|
625
993
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
626
994
|
def save_artifact(
|
@@ -632,64 +1000,30 @@ class CatManager:
|
|
632
1000
|
run: Run | None = None,
|
633
1001
|
) -> Artifact:
|
634
1002
|
"""{}""" # noqa: D415
|
635
|
-
|
636
|
-
|
1003
|
+
# Make sure all labels are saved in the current instance
|
637
1004
|
if not self._is_validated:
|
638
1005
|
self.validate() # returns True or False
|
639
1006
|
if not self._is_validated: # need to raise error manually
|
640
1007
|
raise ValidationError("Dataset does not validate. Please curate.")
|
641
1008
|
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
self.
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
run=run,
|
655
|
-
schema=None,
|
656
|
-
organism=self._organism,
|
657
|
-
)
|
658
|
-
finally:
|
659
|
-
settings.verbosity = verbosity
|
1009
|
+
self._artifact = save_artifact( # type: ignore
|
1010
|
+
self._dataset,
|
1011
|
+
key=key,
|
1012
|
+
description=description,
|
1013
|
+
fields=self.categoricals,
|
1014
|
+
index_field=self._columns_field,
|
1015
|
+
artifact=self._artifact,
|
1016
|
+
revises=revises,
|
1017
|
+
run=run,
|
1018
|
+
schema=None,
|
1019
|
+
organism=self._organism,
|
1020
|
+
)
|
660
1021
|
|
661
1022
|
return self._artifact
|
662
1023
|
|
663
1024
|
|
664
1025
|
class DataFrameCatManager(CatManager):
|
665
|
-
"""
|
666
|
-
|
667
|
-
See also :class:`~lamindb.Curator`.
|
668
|
-
|
669
|
-
Args:
|
670
|
-
df: The DataFrame object to curate.
|
671
|
-
columns: The field attribute for the feature column.
|
672
|
-
categoricals: A dictionary mapping column names to registry_field.
|
673
|
-
verbosity: The verbosity level.
|
674
|
-
organism: The organism name.
|
675
|
-
sources: A dictionary mapping column names to Source records.
|
676
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
677
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
678
|
-
using the exclude parameter ensures they are not validated.
|
679
|
-
|
680
|
-
Returns:
|
681
|
-
A curator object.
|
682
|
-
|
683
|
-
Examples:
|
684
|
-
>>> import bionty as bt
|
685
|
-
>>> curator = ln.Curator.from_df(
|
686
|
-
... df,
|
687
|
-
... categoricals={
|
688
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
689
|
-
... "donor_id": ULabel.name
|
690
|
-
... }
|
691
|
-
... )
|
692
|
-
"""
|
1026
|
+
"""Categorical manager for `DataFrame`."""
|
693
1027
|
|
694
1028
|
def __init__(
|
695
1029
|
self,
|
@@ -699,10 +1033,7 @@ class DataFrameCatManager(CatManager):
|
|
699
1033
|
verbosity: str = "hint",
|
700
1034
|
organism: str | None = None,
|
701
1035
|
sources: dict[str, Record] | None = None,
|
702
|
-
exclude: dict | None = None,
|
703
1036
|
) -> None:
|
704
|
-
from lamindb.core._settings import settings
|
705
|
-
|
706
1037
|
if organism is not None and not isinstance(organism, str):
|
707
1038
|
raise ValueError("organism must be a string such as 'human' or 'mouse'!")
|
708
1039
|
|
@@ -714,20 +1045,21 @@ class DataFrameCatManager(CatManager):
|
|
714
1045
|
organism=organism,
|
715
1046
|
categoricals=categoricals,
|
716
1047
|
sources=sources,
|
717
|
-
exclude=exclude,
|
718
1048
|
)
|
719
1049
|
self._save_columns()
|
720
1050
|
|
721
|
-
def lookup(self, public: bool = False) ->
|
1051
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
722
1052
|
"""Lookup categories.
|
723
1053
|
|
724
1054
|
Args:
|
725
1055
|
public: If "public", the lookup is performed on the public reference.
|
726
1056
|
"""
|
727
|
-
return
|
1057
|
+
return CatLookup(
|
728
1058
|
categoricals=self._categoricals,
|
729
1059
|
slots={"columns": self._columns_field},
|
730
1060
|
public=public,
|
1061
|
+
organism=self._organism,
|
1062
|
+
sources=self._sources,
|
731
1063
|
)
|
732
1064
|
|
733
1065
|
def _save_columns(self, validated_only: bool = True) -> None:
|
@@ -736,28 +1068,26 @@ class DataFrameCatManager(CatManager):
|
|
736
1068
|
update_registry(
|
737
1069
|
values=list(self.categoricals.keys()),
|
738
1070
|
field=self._columns_field,
|
739
|
-
key="columns",
|
1071
|
+
key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
|
740
1072
|
validated_only=False,
|
741
1073
|
source=self._sources.get("columns"),
|
742
|
-
exclude=self._exclude.get("columns"),
|
743
1074
|
)
|
744
1075
|
|
745
1076
|
# Save the rest of the columns based on validated_only
|
746
|
-
additional_columns = set(self._dataset.
|
1077
|
+
additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
|
747
1078
|
if additional_columns:
|
748
1079
|
update_registry(
|
749
1080
|
values=list(additional_columns),
|
750
1081
|
field=self._columns_field,
|
751
|
-
key="columns",
|
1082
|
+
key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
|
752
1083
|
validated_only=validated_only,
|
753
1084
|
df=self._dataset, # Get the Feature type from df
|
754
1085
|
source=self._sources.get("columns"),
|
755
|
-
exclude=self._exclude.get("columns"),
|
756
1086
|
)
|
757
1087
|
|
758
1088
|
@deprecated(new_name="is run by default")
|
759
1089
|
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
760
|
-
pass
|
1090
|
+
pass # pragma: no cover
|
761
1091
|
|
762
1092
|
def validate(self) -> bool:
|
763
1093
|
"""Validate variables and categorical observations.
|
@@ -778,7 +1108,6 @@ class DataFrameCatManager(CatManager):
|
|
778
1108
|
self._dataset,
|
779
1109
|
fields=self.categoricals,
|
780
1110
|
sources=self._sources,
|
781
|
-
exclude=self._exclude,
|
782
1111
|
curator=self,
|
783
1112
|
organism=self._organism,
|
784
1113
|
)
|
@@ -814,7 +1143,7 @@ class DataFrameCatManager(CatManager):
|
|
814
1143
|
else:
|
815
1144
|
if key not in avail_keys:
|
816
1145
|
if key in self._categoricals:
|
817
|
-
logger.
|
1146
|
+
logger.warning(f"No non-standardized values found for {key!r}")
|
818
1147
|
else:
|
819
1148
|
raise KeyError(
|
820
1149
|
f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
|
@@ -852,7 +1181,6 @@ class DataFrameCatManager(CatManager):
|
|
852
1181
|
key=categorical,
|
853
1182
|
validated_only=validated_only,
|
854
1183
|
source=self._sources.get(categorical),
|
855
|
-
exclude=self._exclude.get(categorical),
|
856
1184
|
organism=self._organism,
|
857
1185
|
)
|
858
1186
|
# adding new records removes them from non_validated
|
@@ -882,32 +1210,7 @@ class DataFrameCatManager(CatManager):
|
|
882
1210
|
|
883
1211
|
|
884
1212
|
class AnnDataCatManager(CatManager):
|
885
|
-
"""
|
886
|
-
|
887
|
-
Args:
|
888
|
-
data: The AnnData object or an AnnData-like path.
|
889
|
-
var_index: The registry field for mapping the ``.var`` index.
|
890
|
-
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
891
|
-
obs_columns: The registry field for mapping the ``.obs.columns``.
|
892
|
-
verbosity: The verbosity level.
|
893
|
-
organism: The organism name.
|
894
|
-
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
895
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
896
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
897
|
-
using the exclude parameter ensures they are not validated.
|
898
|
-
|
899
|
-
Examples:
|
900
|
-
>>> import bionty as bt
|
901
|
-
>>> curator = ln.Curator.from_anndata(
|
902
|
-
... adata,
|
903
|
-
... var_index=bt.Gene.ensembl_gene_id,
|
904
|
-
... categoricals={
|
905
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
906
|
-
... "donor_id": ULabel.name
|
907
|
-
... },
|
908
|
-
... organism="human",
|
909
|
-
... )
|
910
|
-
"""
|
1213
|
+
"""Categorical manager for `AnnData`."""
|
911
1214
|
|
912
1215
|
def __init__(
|
913
1216
|
self,
|
@@ -918,13 +1221,12 @@ class AnnDataCatManager(CatManager):
|
|
918
1221
|
verbosity: str = "hint",
|
919
1222
|
organism: str | None = None,
|
920
1223
|
sources: dict[str, Record] | None = None,
|
921
|
-
exclude: dict | None = None,
|
922
1224
|
) -> None:
|
923
1225
|
if isinstance(var_index, str):
|
924
|
-
raise TypeError(
|
1226
|
+
raise TypeError(
|
1227
|
+
"var_index parameter has to be a field, e.g. Gene.ensembl_gene_id"
|
1228
|
+
)
|
925
1229
|
|
926
|
-
if sources is None:
|
927
|
-
sources = {}
|
928
1230
|
if not data_is_anndata(data):
|
929
1231
|
raise TypeError("data has to be an AnnData object")
|
930
1232
|
|
@@ -935,12 +1237,12 @@ class AnnDataCatManager(CatManager):
|
|
935
1237
|
|
936
1238
|
self._obs_fields = categoricals or {}
|
937
1239
|
self._var_field = var_index
|
1240
|
+
self._sources = sources or {}
|
938
1241
|
super().__init__(
|
939
1242
|
dataset=data,
|
940
1243
|
categoricals=categoricals,
|
941
|
-
sources=
|
1244
|
+
sources=self._sources,
|
942
1245
|
organism=organism,
|
943
|
-
exclude=exclude,
|
944
1246
|
columns_field=var_index,
|
945
1247
|
)
|
946
1248
|
self._adata = self._dataset
|
@@ -950,8 +1252,7 @@ class AnnDataCatManager(CatManager):
|
|
950
1252
|
columns=obs_columns,
|
951
1253
|
verbosity=verbosity,
|
952
1254
|
organism=None,
|
953
|
-
sources=
|
954
|
-
exclude=exclude,
|
1255
|
+
sources=self._sources,
|
955
1256
|
)
|
956
1257
|
|
957
1258
|
@property
|
@@ -964,16 +1265,18 @@ class AnnDataCatManager(CatManager):
|
|
964
1265
|
"""Return the obs fields to validate against."""
|
965
1266
|
return self._obs_fields
|
966
1267
|
|
967
|
-
def lookup(self, public: bool = False) ->
|
1268
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
968
1269
|
"""Lookup categories.
|
969
1270
|
|
970
1271
|
Args:
|
971
1272
|
public: If "public", the lookup is performed on the public reference.
|
972
1273
|
"""
|
973
|
-
return
|
1274
|
+
return CatLookup(
|
974
1275
|
categoricals=self._obs_fields,
|
975
1276
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
976
1277
|
public=public,
|
1278
|
+
organism=self._organism,
|
1279
|
+
sources=self._sources,
|
977
1280
|
)
|
978
1281
|
|
979
1282
|
def _save_from_var_index(
|
@@ -989,7 +1292,6 @@ class AnnDataCatManager(CatManager):
|
|
989
1292
|
validated_only=validated_only,
|
990
1293
|
organism=self._organism,
|
991
1294
|
source=self._sources.get("var_index"),
|
992
|
-
exclude=self._exclude.get("var_index"),
|
993
1295
|
)
|
994
1296
|
|
995
1297
|
def add_new_from(self, key: str, **kwargs):
|
@@ -1033,7 +1335,6 @@ class AnnDataCatManager(CatManager):
|
|
1033
1335
|
key="var_index",
|
1034
1336
|
source=self._sources.get("var_index"),
|
1035
1337
|
hint_print=".add_new_from_var_index()",
|
1036
|
-
exclude=self._exclude.get("var_index"),
|
1037
1338
|
organism=self._organism, # type: ignore
|
1038
1339
|
)
|
1039
1340
|
else:
|
@@ -1077,59 +1378,29 @@ class AnnDataCatManager(CatManager):
|
|
1077
1378
|
|
1078
1379
|
|
1079
1380
|
class MuDataCatManager(CatManager):
|
1080
|
-
"""
|
1081
|
-
|
1082
|
-
Args:
|
1083
|
-
mdata: The MuData object to curate.
|
1084
|
-
var_index: The registry field for mapping the ``.var`` index for each modality.
|
1085
|
-
For example:
|
1086
|
-
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
|
1087
|
-
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
1088
|
-
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
1089
|
-
verbosity: The verbosity level.
|
1090
|
-
organism: The organism name.
|
1091
|
-
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
1092
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
1093
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1094
|
-
using the exclude parameter ensures they are not validated.
|
1095
|
-
|
1096
|
-
Examples:
|
1097
|
-
>>> import bionty as bt
|
1098
|
-
>>> curator = ln.Curator.from_mudata(
|
1099
|
-
... mdata,
|
1100
|
-
... var_index={
|
1101
|
-
... "rna": bt.Gene.ensembl_gene_id,
|
1102
|
-
... "adt": CellMarker.name
|
1103
|
-
... },
|
1104
|
-
... categoricals={
|
1105
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1106
|
-
... "donor_id": ULabel.name
|
1107
|
-
... },
|
1108
|
-
... organism="human",
|
1109
|
-
... )
|
1110
|
-
"""
|
1381
|
+
"""Categorical manager for `MuData`."""
|
1111
1382
|
|
1112
1383
|
def __init__(
|
1113
1384
|
self,
|
1114
1385
|
mdata: MuData | Artifact,
|
1115
|
-
var_index: dict[str, FieldAttr],
|
1386
|
+
var_index: dict[str, FieldAttr] | None = None,
|
1116
1387
|
categoricals: dict[str, FieldAttr] | None = None,
|
1117
1388
|
verbosity: str = "hint",
|
1118
1389
|
organism: str | None = None,
|
1119
1390
|
sources: dict[str, Record] | None = None,
|
1120
|
-
exclude: dict | None = None, # {modality: {field: [values]}}
|
1121
1391
|
) -> None:
|
1122
1392
|
super().__init__(
|
1123
1393
|
dataset=mdata,
|
1124
1394
|
categoricals={},
|
1125
1395
|
sources=sources,
|
1126
1396
|
organism=organism,
|
1127
|
-
exclude=exclude,
|
1128
1397
|
)
|
1129
|
-
self._columns_field =
|
1130
|
-
|
1398
|
+
self._columns_field = (
|
1399
|
+
var_index or {}
|
1400
|
+
) # this is for consistency with BaseCatManager
|
1401
|
+
self._var_fields = var_index or {}
|
1131
1402
|
self._verify_modality(self._var_fields.keys())
|
1132
|
-
self._obs_fields = self._parse_categoricals(categoricals)
|
1403
|
+
self._obs_fields = self._parse_categoricals(categoricals or {})
|
1133
1404
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
1134
1405
|
self._verbosity = verbosity
|
1135
1406
|
self._obs_df_curator = None
|
@@ -1140,7 +1411,6 @@ class MuDataCatManager(CatManager):
|
|
1140
1411
|
categoricals=self._obs_fields.get("obs", {}),
|
1141
1412
|
verbosity=verbosity,
|
1142
1413
|
sources=self._sources.get("obs"),
|
1143
|
-
exclude=self._exclude.get("obs"),
|
1144
1414
|
organism=organism,
|
1145
1415
|
)
|
1146
1416
|
self._mod_adata_curators = {
|
@@ -1150,7 +1420,6 @@ class MuDataCatManager(CatManager):
|
|
1150
1420
|
categoricals=self._obs_fields.get(modality),
|
1151
1421
|
verbosity=verbosity,
|
1152
1422
|
sources=self._sources.get(modality),
|
1153
|
-
exclude=self._exclude.get(modality),
|
1154
1423
|
organism=organism,
|
1155
1424
|
)
|
1156
1425
|
for modality in self._modalities
|
@@ -1199,7 +1468,7 @@ class MuDataCatManager(CatManager):
|
|
1199
1468
|
obs_fields["obs"][k] = v
|
1200
1469
|
return obs_fields
|
1201
1470
|
|
1202
|
-
def lookup(self, public: bool = False) ->
|
1471
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
1203
1472
|
"""Lookup categories.
|
1204
1473
|
|
1205
1474
|
Args:
|
@@ -1212,12 +1481,14 @@ class MuDataCatManager(CatManager):
|
|
1212
1481
|
obs_fields[k] = v
|
1213
1482
|
else:
|
1214
1483
|
obs_fields[f"{mod}:{k}"] = v
|
1215
|
-
return
|
1484
|
+
return CatLookup(
|
1216
1485
|
categoricals=obs_fields,
|
1217
1486
|
slots={
|
1218
1487
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
1219
1488
|
},
|
1220
1489
|
public=public,
|
1490
|
+
organism=self._organism,
|
1491
|
+
sources=self._sources,
|
1221
1492
|
)
|
1222
1493
|
|
1223
1494
|
@deprecated(new_name="is run by default")
|
@@ -1227,7 +1498,7 @@ class MuDataCatManager(CatManager):
|
|
1227
1498
|
column_names: list[str] | None = None,
|
1228
1499
|
**kwargs,
|
1229
1500
|
):
|
1230
|
-
pass
|
1501
|
+
pass # pragma: no cover
|
1231
1502
|
|
1232
1503
|
def add_new_from_var_index(self, modality: str, **kwargs):
|
1233
1504
|
"""Update variable records.
|
@@ -1271,16 +1542,8 @@ class MuDataCatManager(CatManager):
|
|
1271
1542
|
|
1272
1543
|
def validate(self) -> bool:
|
1273
1544
|
"""Validate categories."""
|
1274
|
-
from lamindb.core._settings import settings
|
1275
|
-
|
1276
1545
|
# add all validated records to the current instance
|
1277
|
-
|
1278
|
-
try:
|
1279
|
-
settings.verbosity = "error"
|
1280
|
-
self._update_registry_all()
|
1281
|
-
finally:
|
1282
|
-
settings.verbosity = verbosity
|
1283
|
-
|
1546
|
+
self._update_registry_all()
|
1284
1547
|
self._non_validated = {} # type: ignore
|
1285
1548
|
|
1286
1549
|
obs_validated = True
|
@@ -1329,393 +1592,287 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1329
1592
|
)
|
1330
1593
|
|
1331
1594
|
|
1332
|
-
class
|
1333
|
-
"""
|
1334
|
-
|
1335
|
-
Args:
|
1336
|
-
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
1337
|
-
var_index: The registry fields for mapping the `.var` indices for measurements.
|
1338
|
-
Should be in the form `{"measurement name": ("var column", field)}`.
|
1339
|
-
These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
|
1340
|
-
in `.standardize` or `.add_new_from`, see the output of `.var_index`.
|
1341
|
-
categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
|
1342
|
-
obs_columns: The registry field for mapping the names of the `.obs` columns.
|
1343
|
-
organism: The organism name.
|
1344
|
-
sources: A dictionary mapping `.obs` columns to Source records.
|
1345
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
1346
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1347
|
-
using the exclude parameter ensures they are not validated.
|
1348
|
-
|
1349
|
-
Examples:
|
1350
|
-
>>> import bionty as bt
|
1351
|
-
>>> curator = ln.Curator.from_tiledbsoma(
|
1352
|
-
... "./my_array_store.tiledbsoma",
|
1353
|
-
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1354
|
-
... categoricals={
|
1355
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1356
|
-
... "donor_id": ULabel.name
|
1357
|
-
... },
|
1358
|
-
... organism="human",
|
1359
|
-
... )
|
1360
|
-
"""
|
1595
|
+
class SpatialDataCatManager(CatManager):
|
1596
|
+
"""Categorical manager for `SpatialData`."""
|
1361
1597
|
|
1362
1598
|
def __init__(
|
1363
1599
|
self,
|
1364
|
-
|
1365
|
-
var_index: dict[str,
|
1366
|
-
categoricals: dict[str, FieldAttr] | None = None,
|
1367
|
-
|
1600
|
+
sdata: Any,
|
1601
|
+
var_index: dict[str, FieldAttr],
|
1602
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
1603
|
+
verbosity: str = "hint",
|
1368
1604
|
organism: str | None = None,
|
1369
|
-
sources: dict[str, Record] | None = None,
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1605
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
1606
|
+
*,
|
1607
|
+
sample_metadata_key: str | None = "sample",
|
1608
|
+
) -> None:
|
1609
|
+
super().__init__(
|
1610
|
+
dataset=sdata,
|
1611
|
+
categoricals={},
|
1612
|
+
sources=sources,
|
1613
|
+
organism=organism,
|
1614
|
+
)
|
1615
|
+
if isinstance(sdata, Artifact):
|
1616
|
+
self._sdata = sdata.load()
|
1378
1617
|
else:
|
1379
|
-
self.
|
1380
|
-
|
1381
|
-
self.
|
1382
|
-
self.
|
1383
|
-
self.
|
1384
|
-
|
1385
|
-
self.
|
1386
|
-
|
1387
|
-
self._validated_values: dict[str, list] = {}
|
1388
|
-
# filled by _check_save_keys
|
1389
|
-
self._n_obs: int | None = None
|
1390
|
-
self._valid_obs_keys: list[str] | None = None
|
1391
|
-
self._obs_pa_schema: pa.lib.Schema | None = (
|
1392
|
-
None # this is needed to create the obs feature set
|
1618
|
+
self._sdata = self._dataset
|
1619
|
+
self._sample_metadata_key = sample_metadata_key
|
1620
|
+
self._write_path = None
|
1621
|
+
self._var_fields = var_index
|
1622
|
+
self._verify_accessor_exists(self._var_fields.keys())
|
1623
|
+
self._categoricals = categoricals
|
1624
|
+
self._table_keys = set(self._var_fields.keys()) | set(
|
1625
|
+
self._categoricals.keys() - {self._sample_metadata_key}
|
1393
1626
|
)
|
1394
|
-
self.
|
1395
|
-
self.
|
1396
|
-
self.
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1402
|
-
|
1403
|
-
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1404
|
-
experiment_obs = experiment.obs
|
1405
|
-
self._n_obs = len(experiment_obs)
|
1406
|
-
self._obs_pa_schema = experiment_obs.schema
|
1407
|
-
valid_obs_keys = [
|
1408
|
-
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1409
|
-
]
|
1410
|
-
self._valid_obs_keys = valid_obs_keys
|
1411
|
-
|
1412
|
-
valid_var_keys = []
|
1413
|
-
ms_list = []
|
1414
|
-
for ms in experiment.ms.keys():
|
1415
|
-
ms_list.append(ms)
|
1416
|
-
var_ms = experiment.ms[ms].var
|
1417
|
-
valid_var_keys += [
|
1418
|
-
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1419
|
-
]
|
1420
|
-
self._valid_var_keys = valid_var_keys
|
1627
|
+
self._verbosity = verbosity
|
1628
|
+
self._sample_df_curator = None
|
1629
|
+
if self._sample_metadata_key is not None:
|
1630
|
+
self._sample_metadata = self._sdata.get_attrs(
|
1631
|
+
key=self._sample_metadata_key, return_as="df", flatten=True
|
1632
|
+
)
|
1633
|
+
self._is_validated = False
|
1421
1634
|
|
1422
|
-
#
|
1635
|
+
# Check validity of keys in categoricals
|
1423
1636
|
nonval_keys = []
|
1424
|
-
for
|
1425
|
-
if
|
1426
|
-
|
1637
|
+
for accessor, accessor_categoricals in self._categoricals.items():
|
1638
|
+
if (
|
1639
|
+
accessor == self._sample_metadata_key
|
1640
|
+
and self._sample_metadata is not None
|
1641
|
+
):
|
1642
|
+
for key in accessor_categoricals.keys():
|
1643
|
+
if key not in self._sample_metadata.columns:
|
1644
|
+
nonval_keys.append(key)
|
1645
|
+
else:
|
1646
|
+
for key in accessor_categoricals.keys():
|
1647
|
+
if key not in self._sdata[accessor].obs.columns:
|
1648
|
+
nonval_keys.append(key)
|
1649
|
+
|
1427
1650
|
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1428
1651
|
|
1429
|
-
# check validity of keys in
|
1430
|
-
self._var_fields_flat = {}
|
1652
|
+
# check validity of keys in sources
|
1431
1653
|
nonval_keys = []
|
1432
|
-
for
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1654
|
+
for accessor, accessor_sources in self._sources.items():
|
1655
|
+
if (
|
1656
|
+
accessor == self._sample_metadata_key
|
1657
|
+
and self._sample_metadata is not None
|
1658
|
+
):
|
1659
|
+
columns = self._sample_metadata.columns
|
1660
|
+
elif accessor != self._sample_metadata_key:
|
1661
|
+
columns = self._sdata[accessor].obs.columns
|
1437
1662
|
else:
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1444
|
-
nonval_keys = []
|
1445
|
-
for arg_key in dct.keys():
|
1446
|
-
if arg_key not in valid_arg_keys:
|
1447
|
-
nonval_keys.append(arg_key)
|
1448
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1663
|
+
continue
|
1664
|
+
for key in accessor_sources:
|
1665
|
+
if key not in columns:
|
1666
|
+
nonval_keys.append(key)
|
1667
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1449
1668
|
|
1450
|
-
#
|
1451
|
-
register_columns = list(self._obs_fields.keys())
|
1452
|
-
organism = check_registry_organism(
|
1453
|
-
self._columns_field.field.model, self._organism
|
1454
|
-
).get("organism")
|
1455
|
-
update_registry(
|
1456
|
-
values=register_columns,
|
1457
|
-
field=self._columns_field,
|
1458
|
-
key="columns",
|
1459
|
-
validated_only=False,
|
1460
|
-
organism=organism,
|
1461
|
-
source=self._sources.get("columns"),
|
1462
|
-
exclude=self._exclude.get("columns"),
|
1463
|
-
)
|
1464
|
-
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1465
|
-
# no need to register with validated_only=True if columns are features
|
1669
|
+
# Set up sample level metadata and table Curator objects
|
1466
1670
|
if (
|
1467
|
-
|
1468
|
-
and self.
|
1671
|
+
self._sample_metadata_key is not None
|
1672
|
+
and self._sample_metadata_key in self._categoricals
|
1469
1673
|
):
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1474
|
-
|
1674
|
+
self._sample_df_curator = DataFrameCatManager(
|
1675
|
+
df=self._sample_metadata,
|
1676
|
+
columns=Feature.name,
|
1677
|
+
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1678
|
+
verbosity=verbosity,
|
1679
|
+
sources=self._sources.get(self._sample_metadata_key),
|
1680
|
+
organism=organism,
|
1681
|
+
)
|
1682
|
+
self._table_adata_curators = {
|
1683
|
+
table: AnnDataCatManager(
|
1684
|
+
data=self._sdata[table],
|
1685
|
+
var_index=var_index.get(table),
|
1686
|
+
categoricals=self._categoricals.get(table),
|
1687
|
+
verbosity=verbosity,
|
1688
|
+
sources=self._sources.get(table),
|
1475
1689
|
organism=organism,
|
1476
|
-
source=self._sources.get("columns"),
|
1477
|
-
exclude=self._exclude.get("columns"),
|
1478
1690
|
)
|
1691
|
+
for table in self._table_keys
|
1692
|
+
}
|
1479
1693
|
|
1480
|
-
|
1481
|
-
"""Validate categories."""
|
1482
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1694
|
+
self._non_validated = None
|
1483
1695
|
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
var_ms = experiment.ms[ms].var
|
1489
|
-
var_ms_key = f"{ms}__{key}"
|
1490
|
-
# it was already validated and cached
|
1491
|
-
if var_ms_key in self._validated_values:
|
1492
|
-
continue
|
1493
|
-
var_ms_values = (
|
1494
|
-
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
1495
|
-
)
|
1496
|
-
organism = check_registry_organism(
|
1497
|
-
field.field.model, self._organism
|
1498
|
-
).get("organism")
|
1499
|
-
update_registry(
|
1500
|
-
values=var_ms_values,
|
1501
|
-
field=field,
|
1502
|
-
key=var_ms_key,
|
1503
|
-
validated_only=True,
|
1504
|
-
organism=organism,
|
1505
|
-
source=self._sources.get(var_ms_key),
|
1506
|
-
exclude=self._exclude.get(var_ms_key),
|
1507
|
-
)
|
1508
|
-
_, non_val = validate_categories(
|
1509
|
-
values=var_ms_values,
|
1510
|
-
field=field,
|
1511
|
-
key=var_ms_key,
|
1512
|
-
organism=organism,
|
1513
|
-
source=self._sources.get(var_ms_key),
|
1514
|
-
exclude=self._exclude.get(var_ms_key),
|
1515
|
-
)
|
1516
|
-
if len(non_val) > 0:
|
1517
|
-
validated = False
|
1518
|
-
self._non_validated_values[var_ms_key] = non_val
|
1519
|
-
else:
|
1520
|
-
self._validated_values[var_ms_key] = var_ms_values
|
1696
|
+
@property
|
1697
|
+
def var_index(self) -> FieldAttr:
|
1698
|
+
"""Return the registry fields to validate variables indices against."""
|
1699
|
+
return self._var_fields
|
1521
1700
|
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
continue
|
1527
|
-
values = pa.compute.unique(
|
1528
|
-
obs.read(column_names=[key]).concat()[key]
|
1529
|
-
).to_pylist()
|
1530
|
-
organism = check_registry_organism(
|
1531
|
-
field.field.model, self._organism
|
1532
|
-
).get("organism")
|
1533
|
-
update_registry(
|
1534
|
-
values=values,
|
1535
|
-
field=field,
|
1536
|
-
key=key,
|
1537
|
-
validated_only=True,
|
1538
|
-
organism=organism,
|
1539
|
-
source=self._sources.get(key),
|
1540
|
-
exclude=self._exclude.get(key),
|
1541
|
-
)
|
1542
|
-
_, non_val = validate_categories(
|
1543
|
-
values=values,
|
1544
|
-
field=field,
|
1545
|
-
key=key,
|
1546
|
-
organism=organism,
|
1547
|
-
source=self._sources.get(key),
|
1548
|
-
exclude=self._exclude.get(key),
|
1549
|
-
)
|
1550
|
-
if len(non_val) > 0:
|
1551
|
-
validated = False
|
1552
|
-
self._non_validated_values[key] = non_val
|
1553
|
-
else:
|
1554
|
-
self._validated_values[key] = values
|
1555
|
-
self._is_validated = validated
|
1556
|
-
return self._is_validated
|
1701
|
+
@property
|
1702
|
+
def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
|
1703
|
+
"""Return the categorical keys and fields to validate against."""
|
1704
|
+
return self._categoricals
|
1557
1705
|
|
1558
|
-
|
1559
|
-
|
1706
|
+
@property
|
1707
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
1708
|
+
"""Return the non-validated features and labels."""
|
1709
|
+
if self._non_validated is None:
|
1710
|
+
raise ValidationError("Please run validate() first!")
|
1711
|
+
return self._non_validated
|
1560
1712
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1713
|
+
+    def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
+        """Verify that the accessors exist (either a valid table or in attrs)."""
+        for acc in accessors:
+            is_present = False
+            try:
+                self._sdata.get_attrs(key=acc)
+                is_present = True
+            except KeyError:
+                if acc in self._sdata.tables.keys():
+                    is_present = True
+            if not is_present:
+                raise ValidationError(f"Accessor '{acc}' does not exist!")
 
-    def
-        """
+    def lookup(self, public: bool = False) -> CatLookup:
+        """Look up categories.
 
         Args:
-
-                It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
-                or a column name in `.obs`.
+            public: Whether the lookup is performed on the public reference.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            values, field = self._non_validated_values_field(k)
-            if len(values) == 0:
-                continue
-            organism = check_registry_organism(field.field.model, self._organism).get(
-                "organism"
+        cat_values_dict = list(self.categoricals.values())[0]
+        return CatLookup(
+            categoricals=cat_values_dict,
+            slots={"accessors": cat_values_dict.keys()},
+            public=public,
+            organism=self._organism,
+            sources=self._sources,
+        )
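A short, hedged usage sketch of the lookup helper added above; `curator` is an already constructed SpatialDataCatManager from the earlier sketch:

    # browse permissible categories; pass public=True to query the public ontology reference
    lookup = curator.lookup()
    # the returned CatLookup exposes one slot per configured categorical column,
    # which can be browsed via auto-complete in an interactive session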
|
1740
|
+
|
1741
|
+
+    def _update_registry_all(self) -> None:
+        """Saves labels of all features for sample and table metadata."""
+        if self._sample_df_curator is not None:
+            self._sample_df_curator._update_registry_all(
+                validated_only=True,
             )
-
-
-
-                key=k,
-                validated_only=False,
-                organism=organism,
-                source=self._sources.get(k),
-                exclude=self._exclude.get(k),
-                **kwargs,
+        for _, adata_curator in self._table_adata_curators.items():
+            adata_curator._obs_df_curator._update_registry_all(
+                validated_only=True,
             )
-            # update non-validated values list but keep the key there
-            # it will be removed by .validate()
-            if k in self._non_validated_values:
-                self._non_validated_values[k] = []
 
-
-
-        """Return the non-validated features and labels."""
-        non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
-        return non_val
|
1752
|
+
+    def add_new_from_var_index(self, table: str, **kwargs) -> None:
+        """Save new values from ``.var.index`` of table.
 
-
-
-
-
+        Args:
+            table: The table key.
+            organism: The organism name.
+            **kwargs: Additional keyword arguments to pass to create new records.
+        """
+        if self._non_validated is None:
+            raise ValidationError("Run .validate() first.")
+        self._table_adata_curators[table].add_new_from_var_index(**kwargs)
+        if table in self.non_validated.keys():
+            if "var_index" in self._non_validated[table]:
+                self._non_validated[table].pop("var_index")
 
-
-
-        """Return the obs fields to validate against."""
-        return self._obs_fields
+            if len(self.non_validated[table].values()) == 0:
+                self.non_validated.pop(table)
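A hedged sketch of registering non-validated gene identifiers from a table's `.var` index after a first validation pass; the table key "table_1" is a placeholder:

    curator.validate()  # populates .non_validated
    curator.add_new_from_var_index("table_1")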
|
1629
1769
|
|
1630
|
-
def
|
1631
|
-
|
1770
|
+
+    def add_new_from(
+        self,
+        key: str,
+        accessor: str | None = None,
+        **kwargs,
+    ) -> None:
+        """Save new values of categorical from sample level metadata or table.
 
         Args:
-
+            key: The key referencing the slot in the DataFrame.
+            accessor: The accessor key such as 'sample' or 'table x'.
+            organism: The organism name.
+            **kwargs: Additional keyword arguments to pass to create new records.
         """
-
-
-            slots={"columns": self._columns_field, **self._var_fields_flat},
-            public=public,
-        )
+        if self._non_validated is None:
+            raise ValidationError("Run .validate() first.")
 
-
-
+        if len(kwargs) > 0 and key == "all":
+            raise ValueError("Cannot pass additional arguments to 'all' key!")
+
+        if accessor not in self.categoricals:
+            raise ValueError(
+                f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
+            )
+
+        if accessor in self._table_adata_curators:
+            adata_curator = self._table_adata_curators[accessor]
+            adata_curator.add_new_from(key=key, **kwargs)
+        if accessor == self._sample_metadata_key:
+            self._sample_df_curator.add_new_from(key=key, **kwargs)
+
+        if accessor in self.non_validated.keys():
+            if len(self.non_validated[accessor].values()) == 0:
+                self.non_validated.pop(accessor)
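Correspondingly for categorical columns, a minimal sketch that assumes the accessor and column names from the construction example above:

    # register new donor labels found in table "table_1"
    curator.add_new_from(key="donor_id", accessor="table_1")
    # or register everything that did not validate in the sample-level metadata
    curator.add_new_from(key="all", accessor="sample")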
|
1804
|
+
|
1805
|
+
+    def standardize(self, key: str, accessor: str | None = None) -> None:
+        """Replace synonyms with canonical values.
 
         Modifies the dataset inplace.
 
         Args:
-            key: The key referencing the slot in the
-
-                or a column name in `.obs`.
+            key: The key referencing the slot in the table or sample metadata.
+            accessor: The accessor key such as 'sample_key' or 'table_key'.
         """
         if len(self.non_validated) == 0:
             logger.warning("values are already standardized")
             return
-
-
-
+        if self._artifact is not None:
+            raise RuntimeError("can't mutate the dataset when an artifact is passed!")
+
+        if accessor == self._sample_metadata_key:
+            if key not in self._sample_metadata.columns:
+                raise ValueError(f"key '{key}' not present in '{accessor}'!")
         else:
-            if
-
-
-
-
+            if (
+                key == "var_index" and self._sdata.tables[accessor].var.index is None
+            ) or (
+                key != "var_index"
+                and key not in self._sdata.tables[accessor].obs.columns
+            ):
+                raise ValueError(f"key '{key}' not present in '{accessor}'!")
 
-
-
-
-
-
-                ms, _, slot_key = k.partition("__")
-                slot = lambda experiment: experiment.ms[ms].var  # noqa: B023
-            else:
-                slot = lambda experiment: experiment.obs
-                slot_key = k
-            # errors if public ontology and the model has no organism
-            # has to be fixed in bionty
-            organism = check_registry_organism(field.field.model, self._organism).get(
-                "organism"
-            )
-            syn_mapper = standardize_categories(
-                values=values,
-                field=field,
-                source=self._sources.get(k),
-                organism=organism,
-            )
-            if (n_syn_mapper := len(syn_mapper)) == 0:
-                continue
+        if accessor in self._table_adata_curators.keys():
+            adata_curator = self._table_adata_curators[accessor]
+            adata_curator.standardize(key)
+        if accessor == self._sample_metadata_key:
+            self._sample_df_curator.standardize(key)
 
-
+        if len(self.non_validated[accessor].values()) == 0:
+            self.non_validated.pop(accessor)
 
-
-
-            table = slot(experiment).read(value_filter=value_filter).concat()
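A sketch of the in-place synonym standardization step added above, again with hypothetical keys:

    curator.standardize(key="cell_type_ontology_id", accessor="table_1")
    curator.standardize(key="experimental_factor", accessor="sample")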
|
1841
|
+
+    def validate(self) -> bool:
+        """Validate variables and categorical observations.
 
-
-
+        This method also registers the validated records in the current instance:
+        - from public sources
 
-
-
-            df[slot_key] = df[slot_key].map(
-                lambda val: syn_mapper.get(val, val)  # noqa
-            )
-            # write the mapped values
-            with _open_tiledbsoma(self._dataset, mode="w") as experiment:
-                slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
-            # update non_validated dict
-            non_val_k = [
-                nv for nv in self._non_validated_values[k] if nv not in syn_mapper
-            ]
-            self._non_validated_values[k] = non_val_k
+        Args:
+            organism: The organism name.
 
-
-
-
-
-
-
-
+        Returns:
+            Whether the SpatialData object is validated.
+        """
+        # add all validated records to the current instance
+        self._update_registry_all()
+
+        self._non_validated = {}  # type: ignore
+
+        sample_validated = True
+        if self._sample_df_curator:
+            logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
+            sample_validated &= self._sample_df_curator.validate()
+            if len(self._sample_df_curator.non_validated) > 0:
+                self._non_validated["sample"] = self._sample_df_curator.non_validated  # type: ignore
+            logger.print("")
+
+        mods_validated = True
+        for table, adata_curator in self._table_adata_curators.items():
+            logger.info(f"validating categoricals of table '{table}' ...")
+            mods_validated &= adata_curator.validate()
+            if len(adata_curator.non_validated) > 0:
+                self._non_validated[table] = adata_curator.non_validated  # type: ignore
+            logger.print("")
+
+        self._is_validated = sample_validated & mods_validated
+        return self._is_validated
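Putting the pieces together, a minimal end-to-end sketch of the validate-fix-save flow implemented above; the artifact key is a placeholder:

    if not curator.validate():
        print(curator.non_validated)  # {accessor: {column: [non-validated values]}}
        curator.add_new_from(key="all", accessor="table_1")
    artifact = curator.save_artifact(
        key="my_datasets/visium.zarr", description="curated SpatialData"
    )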
|
1719
1876
|
|
1720
1877
|
def save_artifact(
|
1721
1878
|
self,
|
@@ -1725,424 +1882,373 @@ class TiledbsomaCatManager(CatManager):
|
|
1725
1882
|
revises: Artifact | None = None,
|
1726
1883
|
run: Run | None = None,
|
1727
1884
|
) -> Artifact:
|
1728
|
-
"""Save the validated
|
1885
|
+
"""Save the validated SpatialData store and metadata.
|
1729
1886
|
|
1730
1887
|
Args:
|
1731
|
-
description: A description of the
|
1888
|
+
description: A description of the dataset.
|
1732
1889
|
key: A path-like key to reference artifact in default storage,
|
1733
|
-
e.g., `"
|
1890
|
+
e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
|
1734
1891
|
revises: Previous version of the artifact. Triggers a revision.
|
1735
1892
|
run: The run that creates the artifact.
|
1736
1893
|
|
1737
1894
|
Returns:
|
1738
1895
|
A saved artifact record.
|
1739
1896
|
"""
|
1740
|
-
from lamindb.models.artifact import add_labels
|
1741
|
-
|
1742
1897
|
if not self._is_validated:
|
1743
1898
|
self.validate()
|
1744
1899
|
if not self._is_validated:
|
1745
1900
|
raise ValidationError("Dataset does not validate. Please curate.")
|
1746
1901
|
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1760
|
-
|
1761
|
-
feature_sets = {}
|
1762
|
-
if len(self._obs_fields) > 0:
|
1763
|
-
organism = check_registry_organism(
|
1764
|
-
self._columns_field.field.model, self._organism
|
1765
|
-
).get("organism")
|
1766
|
-
empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
|
1767
|
-
mock_df = pa.Table.from_pydict(
|
1768
|
-
empty_dict, schema=self._obs_pa_schema
|
1769
|
-
).to_pandas()
|
1770
|
-
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
1771
|
-
feature_sets["obs"] = Schema.from_df(
|
1772
|
-
df=mock_df,
|
1773
|
-
field=self._columns_field,
|
1774
|
-
mute=True,
|
1775
|
-
organism=organism,
|
1776
|
-
)
|
1777
|
-
for ms in self._var_fields:
|
1778
|
-
var_key, var_field = self._var_fields[ms]
|
1779
|
-
organism = check_registry_organism(
|
1780
|
-
var_field.field.model, self._organism
|
1781
|
-
).get("organism")
|
1782
|
-
feature_sets[f"{ms}__var"] = Schema.from_values(
|
1783
|
-
values=self._validated_values[f"{ms}__{var_key}"],
|
1784
|
-
field=var_field,
|
1785
|
-
organism=organism,
|
1786
|
-
raise_validation_error=False,
|
1787
|
-
)
|
1788
|
-
artifact._staged_feature_sets = feature_sets
|
1789
|
-
|
1790
|
-
feature_ref_is_name = _ref_is_name(self._columns_field)
|
1791
|
-
features = Feature.lookup().dict()
|
1792
|
-
for key, field in self._obs_fields.items():
|
1793
|
-
feature = features.get(key)
|
1794
|
-
registry = field.field.model
|
1795
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1796
|
-
"organism"
|
1797
|
-
)
|
1798
|
-
labels = registry.from_values(
|
1799
|
-
values=self._validated_values[key], field=field, organism=organism
|
1800
|
-
)
|
1801
|
-
if len(labels) == 0:
|
1802
|
-
continue
|
1803
|
-
if hasattr(registry, "_name_field"):
|
1804
|
-
label_ref_is_name = field.field.name == registry._name_field
|
1805
|
-
add_labels(
|
1806
|
-
artifact,
|
1807
|
-
records=labels,
|
1808
|
-
feature=feature,
|
1809
|
-
feature_ref_is_name=feature_ref_is_name,
|
1810
|
-
label_ref_is_name=label_ref_is_name,
|
1811
|
-
from_curator=True,
|
1812
|
-
)
|
1813
|
-
|
1814
|
-
return artifact.save()
|
1815
|
-
|
1816
|
-
|
1817
|
-
class SpatialDataCatManager(CatManager):
|
1818
|
-
"""Curation flow for a ``Spatialdata`` object.
|
1819
|
-
|
1820
|
-
See also :class:`~lamindb.Curator`.
|
1821
|
-
|
1822
|
-
Note that if genes or other measurements are removed from the SpatialData object,
|
1823
|
-
the object should be recreated.
|
1824
|
-
|
1825
|
-
In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
|
1902
|
+
return save_artifact(
|
1903
|
+
self._sdata,
|
1904
|
+
description=description,
|
1905
|
+
fields=self.categoricals,
|
1906
|
+
index_field=self.var_index,
|
1907
|
+
key=key,
|
1908
|
+
artifact=self._artifact,
|
1909
|
+
revises=revises,
|
1910
|
+
run=run,
|
1911
|
+
schema=None,
|
1912
|
+
organism=self._organism,
|
1913
|
+
sample_metadata_key=self._sample_metadata_key,
|
1914
|
+
)
|
1826
1915
|
|
1827
|
-
Args:
|
1828
|
-
sdata: The SpatialData object to curate.
|
1829
|
-
var_index: A dictionary mapping table keys to the ``.var`` indices.
|
1830
|
-
categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
|
1831
1916
|
|
1832
|
-
|
1833
|
-
|
1834
|
-
exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
|
1835
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1836
|
-
using the exclude parameter ensures they are not validated.
|
1837
|
-
verbosity: The verbosity level of the logger.
|
1838
|
-
sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
|
1839
|
-
|
1840
|
-
Examples:
|
1841
|
-
>>> import bionty as bt
|
1842
|
-
>>> curator = SpatialDataCatManager(
|
1843
|
-
... sdata,
|
1844
|
-
... var_index={
|
1845
|
-
... "table_1": bt.Gene.ensembl_gene_id,
|
1846
|
-
... },
|
1847
|
-
... categoricals={
|
1848
|
-
... "table1":
|
1849
|
-
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
|
1850
|
-
... "sample":
|
1851
|
-
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1852
|
-
... },
|
1853
|
-
... organism="human",
|
1854
|
-
... )
|
1855
|
-
"""
|
1917
|
+
class TiledbsomaCatManager(CatManager):
|
1918
|
+
"""Categorical manager for `tiledbsoma.Experiment`."""
|
1856
1919
|
|
1857
1920
|
def __init__(
|
1858
1921
|
self,
|
1859
|
-
|
1860
|
-
var_index: dict[str, FieldAttr],
|
1861
|
-
categoricals: dict[str,
|
1862
|
-
|
1922
|
+
experiment_uri: UPathStr | Artifact,
|
1923
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
1924
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
1925
|
+
obs_columns: FieldAttr = Feature.name,
|
1863
1926
|
organism: str | None = None,
|
1864
|
-
sources: dict[str,
|
1865
|
-
|
1866
|
-
|
1867
|
-
sample_metadata_key: str | None = "sample",
|
1868
|
-
) -> None:
|
1869
|
-
super().__init__(
|
1870
|
-
dataset=sdata,
|
1871
|
-
categoricals={},
|
1872
|
-
sources=sources,
|
1873
|
-
organism=organism,
|
1874
|
-
exclude=exclude,
|
1875
|
-
)
|
1876
|
-
if isinstance(sdata, Artifact):
|
1877
|
-
# TODO: load() doesn't yet work
|
1878
|
-
self._sdata = sdata.load()
|
1879
|
-
else:
|
1880
|
-
self._sdata = self._dataset
|
1881
|
-
self._sample_metadata_key = sample_metadata_key
|
1882
|
-
self._write_path = None
|
1927
|
+
sources: dict[str, Record] | None = None,
|
1928
|
+
):
|
1929
|
+
self._obs_fields = categoricals or {}
|
1883
1930
|
self._var_fields = var_index
|
1884
|
-
self.
|
1885
|
-
|
1886
|
-
|
1887
|
-
self.
|
1888
|
-
|
1889
|
-
|
1890
|
-
|
1891
|
-
|
1892
|
-
|
1893
|
-
key=self._sample_metadata_key, return_as="df", flatten=True
|
1894
|
-
)
|
1895
|
-
self._is_validated = False
|
1896
|
-
|
1897
|
-
# Check validity of keys in categoricals
|
1898
|
-
nonval_keys = []
|
1899
|
-
for accessor, accessor_categoricals in self._categoricals.items():
|
1900
|
-
if (
|
1901
|
-
accessor == self._sample_metadata_key
|
1902
|
-
and self._sample_metadata is not None
|
1903
|
-
):
|
1904
|
-
for key in accessor_categoricals.keys():
|
1905
|
-
if key not in self._sample_metadata.columns:
|
1906
|
-
nonval_keys.append(key)
|
1907
|
-
else:
|
1908
|
-
for key in accessor_categoricals.keys():
|
1909
|
-
if key not in self._sdata[accessor].obs.columns:
|
1910
|
-
nonval_keys.append(key)
|
1911
|
-
|
1912
|
-
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1913
|
-
|
1914
|
-
# check validity of keys in sources and exclude
|
1915
|
-
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1916
|
-
nonval_keys = []
|
1917
|
-
for accessor, accessor_sources in dct.items():
|
1918
|
-
if (
|
1919
|
-
accessor == self._sample_metadata_key
|
1920
|
-
and self._sample_metadata is not None
|
1921
|
-
):
|
1922
|
-
columns = self._sample_metadata.columns
|
1923
|
-
elif accessor != self._sample_metadata_key:
|
1924
|
-
columns = self._sdata[accessor].obs.columns
|
1925
|
-
else:
|
1926
|
-
continue
|
1927
|
-
for key in accessor_sources:
|
1928
|
-
if key not in columns:
|
1929
|
-
nonval_keys.append(key)
|
1930
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1931
|
+
self._columns_field = obs_columns
|
1932
|
+
if isinstance(experiment_uri, Artifact):
|
1933
|
+
self._dataset = experiment_uri.path
|
1934
|
+
self._artifact = experiment_uri
|
1935
|
+
else:
|
1936
|
+
self._dataset = UPath(experiment_uri)
|
1937
|
+
self._artifact = None
|
1938
|
+
self._organism = organism
|
1939
|
+
self._sources = sources or {}
|
1931
1940
|
|
1932
|
-
|
1933
|
-
|
1934
|
-
|
1935
|
-
|
1936
|
-
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
|
1941
|
-
|
1942
|
-
|
1943
|
-
|
1944
|
-
organism=organism,
|
1945
|
-
)
|
1946
|
-
self._table_adata_curators = {
|
1947
|
-
table: AnnDataCatManager(
|
1948
|
-
data=self._sdata[table],
|
1949
|
-
var_index=var_index.get(table),
|
1950
|
-
categoricals=self._categoricals.get(table),
|
1951
|
-
verbosity=verbosity,
|
1952
|
-
sources=self._sources.get(table),
|
1953
|
-
exclude=self._exclude.get(table),
|
1954
|
-
organism=organism,
|
1955
|
-
)
|
1956
|
-
for table in self._table_keys
|
1957
|
-
}
|
1941
|
+
self._is_validated: bool | None = False
|
1942
|
+
self._non_validated_values: dict[str, list] | None = None
|
1943
|
+
self._validated_values: dict[str, list] = {}
|
1944
|
+
# filled by _check_save_keys
|
1945
|
+
self._n_obs: int | None = None
|
1946
|
+
self._valid_obs_keys: list[str] | None = None
|
1947
|
+
self._obs_pa_schema: pa.lib.Schema | None = (
|
1948
|
+
None # this is needed to create the obs feature set
|
1949
|
+
)
|
1950
|
+
self._valid_var_keys: list[str] | None = None
|
1951
|
+
self._var_fields_flat: dict[str, FieldAttr] | None = None
|
1952
|
+
self._check_save_keys()
|
1958
1953
|
|
1959
|
-
|
1954
|
+
# check that the provided keys in var_index and categoricals are available in the store
|
1955
|
+
# and save features
|
1956
|
+
def _check_save_keys(self):
|
1957
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1960
1958
|
|
1961
|
-
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1959
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1960
|
+
experiment_obs = experiment.obs
|
1961
|
+
self._n_obs = len(experiment_obs)
|
1962
|
+
self._obs_pa_schema = experiment_obs.schema
|
1963
|
+
valid_obs_keys = [
|
1964
|
+
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1965
|
+
]
|
1966
|
+
self._valid_obs_keys = valid_obs_keys
|
1965
1967
|
|
1966
|
-
|
1967
|
-
|
1968
|
-
|
1969
|
-
|
1968
|
+
valid_var_keys = []
|
1969
|
+
ms_list = []
|
1970
|
+
for ms in experiment.ms.keys():
|
1971
|
+
ms_list.append(ms)
|
1972
|
+
var_ms = experiment.ms[ms].var
|
1973
|
+
valid_var_keys += [
|
1974
|
+
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1975
|
+
]
|
1976
|
+
self._valid_var_keys = valid_var_keys
|
1970
1977
|
|
1971
|
-
|
1972
|
-
|
1973
|
-
|
1974
|
-
|
1975
|
-
|
1976
|
-
|
1978
|
+
# check validity of keys in categoricals
|
1979
|
+
nonval_keys = []
|
1980
|
+
for obs_key in self._obs_fields.keys():
|
1981
|
+
if obs_key not in valid_obs_keys:
|
1982
|
+
nonval_keys.append(obs_key)
|
1983
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1977
1984
|
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
|
1988
|
-
|
1989
|
-
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
1985
|
+
# check validity of keys in var_index
|
1986
|
+
self._var_fields_flat = {}
|
1987
|
+
nonval_keys = []
|
1988
|
+
for ms_key in self._var_fields.keys():
|
1989
|
+
var_key, var_field = self._var_fields[ms_key]
|
1990
|
+
var_key_flat = f"{ms_key}__{var_key}"
|
1991
|
+
if var_key_flat not in valid_var_keys:
|
1992
|
+
nonval_keys.append(f"({ms_key}, {var_key})")
|
1993
|
+
else:
|
1994
|
+
self._var_fields_flat[var_key_flat] = var_field
|
1995
|
+
_maybe_curation_keys_not_present(nonval_keys, "var_index")
|
1990
1996
|
|
1991
|
-
|
1992
|
-
""
|
1997
|
+
# check validity of keys in sources
|
1998
|
+
valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
|
1999
|
+
nonval_keys = []
|
2000
|
+
for arg_key in self._sources.keys():
|
2001
|
+
if arg_key not in valid_arg_keys:
|
2002
|
+
nonval_keys.append(arg_key)
|
2003
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1993
2004
|
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2005
|
+
# register obs columns' names
|
2006
|
+
register_columns = list(self._obs_fields.keys())
|
2007
|
+
update_registry(
|
2008
|
+
values=register_columns,
|
2009
|
+
field=self._columns_field,
|
2010
|
+
key="columns",
|
2011
|
+
validated_only=False,
|
2012
|
+
organism=self._organism,
|
2013
|
+
source=self._sources.get("columns"),
|
2002
2014
|
)
|
2003
|
-
|
2004
|
-
|
2005
|
-
|
2006
|
-
|
2007
|
-
self.
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2015
|
+
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
2016
|
+
# no need to register with validated_only=True if columns are features
|
2017
|
+
if (
|
2018
|
+
len(additional_columns) > 0
|
2019
|
+
and self._columns_field.field.model is not Feature
|
2020
|
+
):
|
2021
|
+
update_registry(
|
2022
|
+
values=additional_columns,
|
2023
|
+
field=self._columns_field,
|
2024
|
+
key="columns",
|
2012
2025
|
validated_only=True,
|
2026
|
+
organism=self._organism,
|
2027
|
+
source=self._sources.get("columns"),
|
2013
2028
|
)
|
2014
2029
|
|
2015
|
-
def
|
2016
|
-
"""
|
2030
|
+
def validate(self):
|
2031
|
+
"""Validate categories."""
|
2032
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
2017
2033
|
|
2018
|
-
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
|
2027
|
-
|
2028
|
-
|
2034
|
+
validated = True
|
2035
|
+
self._non_validated_values = {}
|
2036
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
2037
|
+
for ms, (key, field) in self._var_fields.items():
|
2038
|
+
var_ms = experiment.ms[ms].var
|
2039
|
+
var_ms_key = f"{ms}__{key}"
|
2040
|
+
# it was already validated and cached
|
2041
|
+
if var_ms_key in self._validated_values:
|
2042
|
+
continue
|
2043
|
+
var_ms_values = (
|
2044
|
+
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
2045
|
+
)
|
2046
|
+
update_registry(
|
2047
|
+
values=var_ms_values,
|
2048
|
+
field=field,
|
2049
|
+
key=var_ms_key,
|
2050
|
+
validated_only=True,
|
2051
|
+
organism=self._organism,
|
2052
|
+
source=self._sources.get(var_ms_key),
|
2053
|
+
)
|
2054
|
+
_, non_val = validate_categories(
|
2055
|
+
values=var_ms_values,
|
2056
|
+
field=field,
|
2057
|
+
key=var_ms_key,
|
2058
|
+
organism=self._organism,
|
2059
|
+
source=self._sources.get(var_ms_key),
|
2060
|
+
)
|
2061
|
+
if len(non_val) > 0:
|
2062
|
+
validated = False
|
2063
|
+
self._non_validated_values[var_ms_key] = non_val
|
2064
|
+
else:
|
2065
|
+
self._validated_values[var_ms_key] = var_ms_values
|
2066
|
+
|
2067
|
+
obs = experiment.obs
|
2068
|
+
for key, field in self._obs_fields.items():
|
2069
|
+
# already validated and cached
|
2070
|
+
if key in self._validated_values:
|
2071
|
+
continue
|
2072
|
+
values = pa.compute.unique(
|
2073
|
+
obs.read(column_names=[key]).concat()[key]
|
2074
|
+
).to_pylist()
|
2075
|
+
update_registry(
|
2076
|
+
values=values,
|
2077
|
+
field=field,
|
2078
|
+
key=key,
|
2079
|
+
validated_only=True,
|
2080
|
+
organism=self._organism,
|
2081
|
+
source=self._sources.get(key),
|
2082
|
+
)
|
2083
|
+
_, non_val = validate_categories(
|
2084
|
+
values=values,
|
2085
|
+
field=field,
|
2086
|
+
key=key,
|
2087
|
+
organism=self._organism,
|
2088
|
+
source=self._sources.get(key),
|
2089
|
+
)
|
2090
|
+
if len(non_val) > 0:
|
2091
|
+
validated = False
|
2092
|
+
self._non_validated_values[key] = non_val
|
2093
|
+
else:
|
2094
|
+
self._validated_values[key] = values
|
2095
|
+
self._is_validated = validated
|
2096
|
+
return self._is_validated
|
2097
|
+
|
2098
|
+
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
2099
|
+
assert self._non_validated_values is not None # noqa: S101
|
2029
2100
|
|
2030
|
-
|
2031
|
-
|
2101
|
+
if key in self._valid_obs_keys:
|
2102
|
+
field = self._obs_fields[key]
|
2103
|
+
elif key in self._valid_var_keys:
|
2104
|
+
ms = key.partition("__")[0]
|
2105
|
+
field = self._var_fields[ms][1]
|
2106
|
+
else:
|
2107
|
+
raise KeyError(f"key {key} is invalid!")
|
2108
|
+
values = self._non_validated_values.get(key, [])
|
2109
|
+
return values, field
|
2032
2110
|
|
2033
|
-
def add_new_from(
|
2034
|
-
|
2035
|
-
key: str,
|
2036
|
-
accessor: str | None = None,
|
2037
|
-
**kwargs,
|
2038
|
-
) -> None:
|
2039
|
-
"""Save new values of categorical from sample level metadata or table.
|
2111
|
+
def add_new_from(self, key: str, **kwargs) -> None:
|
2112
|
+
"""Add validated & new categories.
|
2040
2113
|
|
2041
2114
|
Args:
|
2042
|
-
key: The key referencing the slot in the
|
2043
|
-
|
2044
|
-
|
2045
|
-
**kwargs: Additional keyword arguments to pass to create new records.
|
2115
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2116
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2117
|
+
or a column name in `.obs`.
|
2046
2118
|
"""
|
2047
|
-
if self.
|
2119
|
+
if self._non_validated_values is None:
|
2048
2120
|
raise ValidationError("Run .validate() first.")
|
2121
|
+
if key == "all":
|
2122
|
+
keys = list(self._non_validated_values.keys())
|
2123
|
+
else:
|
2124
|
+
avail_keys = list(
|
2125
|
+
chain(self._non_validated_values.keys(), self._validated_values.keys())
|
2126
|
+
)
|
2127
|
+
if key not in avail_keys:
|
2128
|
+
raise KeyError(
|
2129
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2130
|
+
)
|
2131
|
+
keys = [key]
|
2132
|
+
for k in keys:
|
2133
|
+
values, field = self._non_validated_values_field(k)
|
2134
|
+
if len(values) == 0:
|
2135
|
+
continue
|
2136
|
+
update_registry(
|
2137
|
+
values=values,
|
2138
|
+
field=field,
|
2139
|
+
key=k,
|
2140
|
+
validated_only=False,
|
2141
|
+
organism=self._organism,
|
2142
|
+
source=self._sources.get(k),
|
2143
|
+
**kwargs,
|
2144
|
+
)
|
2145
|
+
# update non-validated values list but keep the key there
|
2146
|
+
# it will be removed by .validate()
|
2147
|
+
if k in self._non_validated_values:
|
2148
|
+
self._non_validated_values[k] = []
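For context, a hedged usage sketch of this tiledbsoma curator; the store URI, the measurement name "RNA", and the `.var`/`.obs` column names are placeholders:

    import bionty as bt
    from lamindb import ULabel
    from lamindb.curators import TiledbsomaCatManager

    curator = TiledbsomaCatManager(
        "s3://my-bucket/my-experiment.tiledbsoma",  # or an existing Artifact
        var_index={"RNA": ("var_id", bt.Gene.ensembl_gene_id)},
        categoricals={"cell_type": bt.CellType.name, "donor_id": ULabel.name},
        organism="human",
    )
    curator.validate()
    # keys follow '{measurement name}__{column name in .var}' for .var
    # and plain column names for .obs
    curator.add_new_from("RNA__var_id")
    curator.add_new_from("donor_id")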
|
2049
2149
|
|
2050
|
-
|
2051
|
-
|
2150
|
+
@property
|
2151
|
+
def non_validated(self) -> dict[str, list]:
|
2152
|
+
"""Return the non-validated features and labels."""
|
2153
|
+
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
2154
|
+
return non_val
|
2052
2155
|
|
2053
|
-
|
2054
|
-
|
2055
|
-
|
2056
|
-
|
2156
|
+
@property
|
2157
|
+
def var_index(self) -> dict[str, FieldAttr]:
|
2158
|
+
"""Return the registry fields with flattened keys to validate variables indices against."""
|
2159
|
+
return self._var_fields_flat
|
2057
2160
|
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
2161
|
+
@property
|
2162
|
+
def categoricals(self) -> dict[str, FieldAttr]:
|
2163
|
+
"""Return the obs fields to validate against."""
|
2164
|
+
return self._obs_fields
|
2063
2165
|
|
2064
|
-
|
2065
|
-
|
2066
|
-
self.non_validated.pop(accessor)
|
2166
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
2167
|
+
"""Lookup categories.
|
2067
2168
|
|
2068
|
-
|
2069
|
-
|
2169
|
+
Args:
|
2170
|
+
public: If "public", the lookup is performed on the public reference.
|
2171
|
+
"""
|
2172
|
+
return CatLookup(
|
2173
|
+
categoricals=self._obs_fields,
|
2174
|
+
slots={"columns": self._columns_field, **self._var_fields_flat},
|
2175
|
+
public=public,
|
2176
|
+
organism=self._organism,
|
2177
|
+
sources=self._sources,
|
2178
|
+
)
|
2179
|
+
|
2180
|
+
def standardize(self, key: str):
|
2181
|
+
"""Replace synonyms with standardized values.
|
2070
2182
|
|
2071
2183
|
Modifies the dataset inplace.
|
2072
2184
|
|
2073
2185
|
Args:
|
2074
|
-
key: The key referencing the slot in the
|
2075
|
-
|
2186
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2187
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2188
|
+
or a column name in `.obs`.
|
2076
2189
|
"""
|
2077
2190
|
if len(self.non_validated) == 0:
|
2078
2191
|
logger.warning("values are already standardized")
|
2079
2192
|
return
|
2080
|
-
|
2081
|
-
|
2082
|
-
|
2083
|
-
if accessor == self._sample_metadata_key:
|
2084
|
-
if key not in self._sample_metadata.columns:
|
2085
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2193
|
+
avail_keys = list(self._non_validated_values.keys())
|
2194
|
+
if key == "all":
|
2195
|
+
keys = avail_keys
|
2086
2196
|
else:
|
2087
|
-
if
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
|
2092
|
-
):
|
2093
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2094
|
-
|
2095
|
-
if accessor in self._table_adata_curators.keys():
|
2096
|
-
adata_curator = self._table_adata_curators[accessor]
|
2097
|
-
adata_curator.standardize(key)
|
2098
|
-
if accessor == self._sample_metadata_key:
|
2099
|
-
self._sample_df_curator.standardize(key)
|
2100
|
-
|
2101
|
-
if len(self.non_validated[accessor].values()) == 0:
|
2102
|
-
self.non_validated.pop(accessor)
|
2103
|
-
|
2104
|
-
def validate(self) -> bool:
|
2105
|
-
"""Validate variables and categorical observations.
|
2106
|
-
|
2107
|
-
This method also registers the validated records in the current instance:
|
2108
|
-
- from public sources
|
2109
|
-
|
2110
|
-
Args:
|
2111
|
-
organism: The organism name.
|
2197
|
+
if key not in avail_keys:
|
2198
|
+
raise KeyError(
|
2199
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2200
|
+
)
|
2201
|
+
keys = [key]
|
2112
2202
|
|
2113
|
-
|
2114
|
-
|
2115
|
-
|
2116
|
-
|
2203
|
+
for k in keys:
|
2204
|
+
values, field = self._non_validated_values_field(k)
|
2205
|
+
if len(values) == 0:
|
2206
|
+
continue
|
2207
|
+
if k in self._valid_var_keys:
|
2208
|
+
ms, _, slot_key = k.partition("__")
|
2209
|
+
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
2210
|
+
else:
|
2211
|
+
slot = lambda experiment: experiment.obs
|
2212
|
+
slot_key = k
|
2213
|
+
syn_mapper = standardize_categories(
|
2214
|
+
values=values,
|
2215
|
+
field=field,
|
2216
|
+
source=self._sources.get(k),
|
2217
|
+
organism=self._organism,
|
2218
|
+
)
|
2219
|
+
if (n_syn_mapper := len(syn_mapper)) == 0:
|
2220
|
+
continue
|
2117
2221
|
|
2118
|
-
|
2119
|
-
verbosity = settings.verbosity
|
2120
|
-
try:
|
2121
|
-
settings.verbosity = "error"
|
2122
|
-
self._update_registry_all()
|
2123
|
-
finally:
|
2124
|
-
settings.verbosity = verbosity
|
2222
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
2125
2223
|
|
2126
|
-
|
2224
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
2225
|
+
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
2226
|
+
table = slot(experiment).read(value_filter=value_filter).concat()
|
2127
2227
|
|
2128
|
-
|
2129
|
-
|
2130
|
-
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
2131
|
-
sample_validated &= self._sample_df_curator.validate()
|
2132
|
-
if len(self._sample_df_curator.non_validated) > 0:
|
2133
|
-
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
2134
|
-
logger.print("")
|
2228
|
+
if len(table) == 0:
|
2229
|
+
continue
|
2135
2230
|
|
2136
|
-
|
2137
|
-
|
2138
|
-
|
2139
|
-
|
2140
|
-
|
2141
|
-
|
2142
|
-
|
2231
|
+
df = table.to_pandas()
|
2232
|
+
# map values
|
2233
|
+
df[slot_key] = df[slot_key].map(
|
2234
|
+
lambda val: syn_mapper.get(val, val) # noqa
|
2235
|
+
)
|
2236
|
+
# write the mapped values
|
2237
|
+
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
2238
|
+
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
2239
|
+
# update non_validated dict
|
2240
|
+
non_val_k = [
|
2241
|
+
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
2242
|
+
]
|
2243
|
+
self._non_validated_values[k] = non_val_k
|
2143
2244
|
|
2144
|
-
|
2145
|
-
|
2245
|
+
syn_mapper_print = _format_values(
|
2246
|
+
[f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
|
2247
|
+
)
|
2248
|
+
s = "s" if n_syn_mapper > 1 else ""
|
2249
|
+
logger.success(
|
2250
|
+
f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
|
2251
|
+
)
|
2146
2252
|
|
2147
2253
|
def save_artifact(
|
2148
2254
|
self,
|
@@ -2152,217 +2258,112 @@ class SpatialDataCatManager(CatManager):
|
|
2152
2258
|
revises: Artifact | None = None,
|
2153
2259
|
run: Run | None = None,
|
2154
2260
|
) -> Artifact:
|
2261
|
+
"""Save the validated `tiledbsoma` store and metadata.
|
2262
|
+
|
2263
|
+
Args:
|
2264
|
+
description: A description of the ``tiledbsoma`` store.
|
2265
|
+
key: A path-like key to reference artifact in default storage,
|
2266
|
+
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
|
2267
|
+
revises: Previous version of the artifact. Triggers a revision.
|
2268
|
+
run: The run that creates the artifact.
|
2269
|
+
|
2270
|
+
Returns:
|
2271
|
+
A saved artifact record.
|
2272
|
+
"""
|
2155
2273
|
if not self._is_validated:
|
2156
2274
|
self.validate()
|
2157
2275
|
if not self._is_validated:
|
2158
2276
|
raise ValidationError("Dataset does not validate. Please curate.")
|
2159
2277
|
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
self._artifact = Artifact.from_spatialdata(
|
2165
|
-
self._sdata,
|
2166
|
-
key=key,
|
2278
|
+
if self._artifact is None:
|
2279
|
+
artifact = Artifact(
|
2280
|
+
self._dataset,
|
2167
2281
|
description=description,
|
2282
|
+
key=key,
|
2168
2283
|
revises=revises,
|
2169
2284
|
run=run,
|
2170
2285
|
)
|
2171
|
-
self.
|
2286
|
+
artifact.n_observations = self._n_obs
|
2287
|
+
artifact.otype = "tiledbsoma"
|
2288
|
+
artifact.save()
|
2289
|
+
else:
|
2290
|
+
artifact = self._artifact
|
2291
|
+
|
2292
|
+
feature_sets = {}
|
2293
|
+
if len(self._obs_fields) > 0:
|
2294
|
+
empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
|
2295
|
+
mock_df = pa.Table.from_pydict(
|
2296
|
+
empty_dict, schema=self._obs_pa_schema
|
2297
|
+
).to_pandas()
|
2298
|
+
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
2299
|
+
feature_sets["obs"] = Schema.from_df(
|
2300
|
+
df=mock_df,
|
2301
|
+
field=self._columns_field,
|
2302
|
+
mute=True,
|
2303
|
+
organism=self._organism,
|
2304
|
+
)
|
2305
|
+
for ms in self._var_fields:
|
2306
|
+
var_key, var_field = self._var_fields[ms]
|
2307
|
+
feature_sets[f"{ms}__var"] = Schema.from_values(
|
2308
|
+
values=self._validated_values[f"{ms}__{var_key}"],
|
2309
|
+
field=var_field,
|
2310
|
+
organism=self._organism,
|
2311
|
+
raise_validation_error=False,
|
2312
|
+
)
|
2313
|
+
artifact._staged_feature_sets = feature_sets
|
2172
2314
|
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
|
2315
|
+
feature_ref_is_name = _ref_is_name(self._columns_field)
|
2316
|
+
features = Feature.lookup().dict()
|
2317
|
+
for key, field in self._obs_fields.items():
|
2318
|
+
feature = features.get(key)
|
2319
|
+
registry = field.field.model
|
2320
|
+
labels = registry.from_values(
|
2321
|
+
values=self._validated_values[key],
|
2322
|
+
field=field,
|
2323
|
+
organism=self._organism,
|
2177
2324
|
)
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2182
|
-
|
2183
|
-
|
2184
|
-
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
|
2189
|
-
assert host.otype == "SpatialData" # noqa: S101
|
2190
|
-
|
2191
|
-
feature_sets = {}
|
2192
|
-
|
2193
|
-
# sample features
|
2194
|
-
sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
|
2195
|
-
if len(sample_features) > 0:
|
2196
|
-
feature_sets[self._sample_metadata_key] = Schema(
|
2197
|
-
features=sample_features
|
2198
|
-
)
|
2199
|
-
|
2200
|
-
# table features
|
2201
|
-
for table, field in var_fields.items():
|
2202
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
2203
|
-
self._sdata[table],
|
2204
|
-
var_field=field,
|
2205
|
-
obs_field=obs_fields.get(table, Feature.name),
|
2206
|
-
mute=mute,
|
2207
|
-
organism=organism,
|
2208
|
-
)
|
2209
|
-
for k, v in table_fs.items():
|
2210
|
-
feature_sets[f"['{table}'].{k}"] = v
|
2211
|
-
|
2212
|
-
def _unify_staged_feature_sets_by_hash(
|
2213
|
-
feature_sets: MutableMapping[str, Schema],
|
2214
|
-
):
|
2215
|
-
unique_values: dict[str, Any] = {}
|
2216
|
-
|
2217
|
-
for key, value in feature_sets.items():
|
2218
|
-
value_hash = (
|
2219
|
-
value.hash
|
2220
|
-
) # Assuming each value has a .hash attribute
|
2221
|
-
if value_hash in unique_values:
|
2222
|
-
feature_sets[key] = unique_values[value_hash]
|
2223
|
-
else:
|
2224
|
-
unique_values[value_hash] = value
|
2225
|
-
|
2226
|
-
return feature_sets
|
2227
|
-
|
2228
|
-
# link feature sets
|
2229
|
-
host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
|
2230
|
-
feature_sets
|
2325
|
+
if len(labels) == 0:
|
2326
|
+
continue
|
2327
|
+
if hasattr(registry, "_name_field"):
|
2328
|
+
label_ref_is_name = field.field.name == registry._name_field
|
2329
|
+
add_labels(
|
2330
|
+
artifact,
|
2331
|
+
records=labels,
|
2332
|
+
feature=feature,
|
2333
|
+
feature_ref_is_name=feature_ref_is_name,
|
2334
|
+
label_ref_is_name=label_ref_is_name,
|
2335
|
+
from_curator=True,
|
2231
2336
|
)
|
2232
|
-
host.save()
|
2233
|
-
|
2234
|
-
_add_set_from_spatialdata(
|
2235
|
-
self._artifact, var_fields=self._var_fields, **feature_kwargs
|
2236
|
-
)
|
2237
|
-
|
2238
|
-
# Link labels
|
2239
|
-
def _add_labels_from_spatialdata(
|
2240
|
-
data,
|
2241
|
-
artifact: Artifact,
|
2242
|
-
fields: dict[str, FieldAttr],
|
2243
|
-
feature_ref_is_name: bool | None = None,
|
2244
|
-
):
|
2245
|
-
"""Add Labels from SpatialData."""
|
2246
|
-
features = Feature.lookup().dict()
|
2247
|
-
for key, field in fields.items():
|
2248
|
-
feature = features.get(key)
|
2249
|
-
registry = field.field.model
|
2250
|
-
filter_kwargs = check_registry_organism(registry, self._organism)
|
2251
|
-
filter_kwargs_current = get_current_filter_kwargs(
|
2252
|
-
registry, filter_kwargs
|
2253
|
-
)
|
2254
|
-
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2255
|
-
labels = registry.from_values(
|
2256
|
-
df[key],
|
2257
|
-
field=field,
|
2258
|
-
**filter_kwargs_current,
|
2259
|
-
)
|
2260
|
-
if len(labels) == 0:
|
2261
|
-
continue
|
2262
2337
|
|
2263
|
-
|
2264
|
-
if hasattr(registry, "_name_field"):
|
2265
|
-
label_ref_is_name = field.field.name == registry._name_field
|
2266
|
-
add_labels(
|
2267
|
-
artifact,
|
2268
|
-
records=labels,
|
2269
|
-
feature=feature,
|
2270
|
-
feature_ref_is_name=feature_ref_is_name,
|
2271
|
-
label_ref_is_name=label_ref_is_name,
|
2272
|
-
from_curator=True,
|
2273
|
-
)
|
2274
|
-
|
2275
|
-
for accessor, accessor_fields in self._categoricals.items():
|
2276
|
-
column_field = self._var_fields.get(accessor)
|
2277
|
-
if accessor == self._sample_metadata_key:
|
2278
|
-
_add_labels_from_spatialdata(
|
2279
|
-
self._sample_metadata,
|
2280
|
-
self._artifact,
|
2281
|
-
accessor_fields,
|
2282
|
-
feature_ref_is_name=(
|
2283
|
-
None if column_field is None else _ref_is_name(column_field)
|
2284
|
-
),
|
2285
|
-
)
|
2286
|
-
else:
|
2287
|
-
_add_labels_from_spatialdata(
|
2288
|
-
self._sdata.tables[accessor],
|
2289
|
-
self._artifact,
|
2290
|
-
accessor_fields,
|
2291
|
-
feature_ref_is_name=(
|
2292
|
-
None if column_field is None else _ref_is_name(column_field)
|
2293
|
-
),
|
2294
|
-
)
|
2295
|
-
|
2296
|
-
finally:
|
2297
|
-
settings.verbosity = verbosity
|
2298
|
-
|
2299
|
-
slug = ln_setup.settings.instance.slug
|
2300
|
-
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
2301
|
-
logger.important(
|
2302
|
-
f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
|
2303
|
-
)
|
2304
|
-
|
2305
|
-
return self._artifact
|
2338
|
+
return artifact.save()
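A short sketch of finishing that flow with the methods above, assuming `curator` from the earlier tiledbsoma sketch; the artifact key is a placeholder:

    # replace synonyms directly in the tiledbsoma store
    curator.standardize("all")
    # save the store together with its obs/var schemas and linked labels
    artifact = curator.save_artifact(
        key="scrna/my-experiment.tiledbsoma",
        description="curated tiledbsoma store",
    )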
|
2306
2339
|
|
2307
2340
|
|
2308
|
-
|
2309
|
-
|
2310
|
-
) -> dict[str, str]:
|
2311
|
-
"""Restrict the obs fields to name return only available obs fields.
|
2341
|
+
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2342
|
+
"""Categorical manager for `AnnData` respecting the CELLxGENE schema.
|
2312
2343
|
|
2313
|
-
|
2314
|
-
If both are available, we validate against ontology_id.
|
2315
|
-
If none are available, we validate against name.
|
2344
|
+
This will be superseded by a schema-based curation flow.
|
2316
2345
|
"""
|
2317
|
-
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
2318
|
-
for name, field in obs_fields.items():
|
2319
|
-
if name.endswith("_ontology_term_id"):
|
2320
|
-
continue
|
2321
|
-
# if both the ontology id and the name are present, only validate on the ontology_id
|
2322
|
-
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
2323
|
-
obs_fields_unique.pop(name)
|
2324
|
-
# if the neither name nor ontology id are present, validate on the name
|
2325
|
-
# this will raise error downstream, we just use name to be more readable
|
2326
|
-
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2327
|
-
obs_fields_unique[name] = field
|
2328
|
-
|
2329
|
-
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
2330
|
-
available_obs_fields = {
|
2331
|
-
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
2332
|
-
}
|
2333
|
-
|
2334
|
-
return available_obs_fields
|
2335
|
-
|
2336
|
-
|
2337
|
-
def _add_defaults_to_obs(
|
2338
|
-
obs: pd.DataFrame,
|
2339
|
-
defaults: dict[str, str],
|
2340
|
-
) -> None:
|
2341
|
-
"""Add default columns and values to obs DataFrame."""
|
2342
|
-
added_defaults: dict = {}
|
2343
|
-
for name, default in defaults.items():
|
2344
|
-
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2345
|
-
obs[name] = default
|
2346
|
-
added_defaults[name] = default
|
2347
|
-
logger.important(
|
2348
|
-
f"added default value '{default}' to the adata.obs['{name}']"
|
2349
|
-
)
|
2350
|
-
|
2351
|
-
|
2352
|
-
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2353
|
-
"""Annotation flow of AnnData based on CELLxGENE schema."""
|
2354
2346
|
|
2355
|
-
|
2347
|
+
+    cxg_categoricals_defaults = {
+        "cell_type": "unknown",
+        "development_stage": "unknown",
+        "disease": "normal",
+        "donor_id": "unknown",
+        "self_reported_ethnicity": "unknown",
+        "sex": "unknown",
+        "suspension_type": "cell",
+        "tissue_type": "tissue",
+    }
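These defaults feed into the constructor below; a minimal, hedged usage sketch in which `adata` is a hypothetical AnnData object laid out along CELLxGENE conventions:

    from lamindb.curators import CellxGeneAnnDataCatManager

    curator = CellxGeneAnnDataCatManager(
        adata,
        organism="human",
        schema_version="5.2.0",
        defaults=CellxGeneAnnDataCatManager.cxg_categoricals_defaults,
    )
    if curator.validate():
        curator.save_artifact(description="CELLxGENE-curated AnnData")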
|
2356
2357
|
|
2357
2358
|
def __init__(
|
2358
2359
|
self,
|
2359
|
-
adata: ad.AnnData
|
2360
|
+
adata: ad.AnnData,
|
2360
2361
|
categoricals: dict[str, FieldAttr] | None = None,
|
2361
2362
|
organism: Literal["human", "mouse"] = "human",
|
2362
2363
|
*,
|
2364
|
+
schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
|
2363
2365
|
defaults: dict[str, str] = None,
|
2364
2366
|
extra_sources: dict[str, Record] = None,
|
2365
|
-
schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
|
2366
2367
|
verbosity: str = "hint",
|
2367
2368
|
) -> None:
|
2368
2369
|
"""CELLxGENE schema curator.
|
@@ -2372,304 +2373,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2372
2373
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
2373
2374
|
The CELLxGENE Curator maps against the required CELLxGENE fields by default.
|
2374
2375
|
organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
|
2376
|
+
schema_version: The CELLxGENE schema version to curate against.
|
2375
2377
|
defaults: Default values that are set if columns or column values are missing.
|
2376
2378
|
extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
|
2377
2379
|
These extra sources are joined with the CELLxGENE fixed sources.
|
2378
2380
|
Use this parameter when subclassing.
|
2379
|
-
exclude: A dictionary mapping column names to values to exclude.
|
2380
|
-
schema_version: The CELLxGENE schema version to curate against.
|
2381
2381
|
verbosity: The verbosity level.
|
2382
|
-
|
2383
2382
|
"""
|
2384
2383
|
import bionty as bt
|
2385
2384
|
|
2386
|
-
|
2385
|
+
from ._cellxgene_schemas import (
|
2386
|
+
_add_defaults_to_obs,
|
2387
|
+
_create_sources,
|
2388
|
+
_init_categoricals_additional_values,
|
2389
|
+
_restrict_obs_fields,
|
2390
|
+
)
|
2387
2391
|
|
2388
|
-
|
2392
|
+
# Add defaults first to ensure that we fetch valid sources
|
2393
|
+
if defaults:
|
2394
|
+
_add_defaults_to_obs(adata.obs, defaults)
|
2389
2395
|
|
2396
|
+
# Filter categoricals based on what's present in adata
|
2390
2397
|
if categoricals is None:
|
2391
|
-
categoricals =
|
2398
|
+
categoricals = self._get_cxg_categoricals()
|
2399
|
+
categoricals = _restrict_obs_fields(adata.obs, categoricals)
|
2392
2400
|
|
2393
|
-
|
2394
|
-
|
2395
|
-
VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
|
2396
|
-
if schema_version not in VALID_SCHEMA_VERSIONS:
|
2397
|
-
valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
|
2398
|
-
raise ValueError(
|
2399
|
-
f"Invalid schema_version: {schema_version}. "
|
2400
|
-
f"Valid versions are: {valid_versions}"
|
2401
|
-
)
|
2401
|
+
# Configure sources
|
2402
|
+
sources = _create_sources(categoricals, schema_version, organism)
|
2402
2403
|
self.schema_version = schema_version
|
2403
2404
|
self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
|
2404
|
-
with resources.path(
|
2405
|
-
"lamindb.curators._cellxgene_schemas", "schema_versions.yml"
|
2406
|
-
) as schema_versions_path:
|
2407
|
-
self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
|
2408
|
-
self.schema_version
|
2409
|
-
]
|
2410
|
-
|
2411
|
-
# Fetch AnnData obs to be able to set defaults and get sources
|
2412
|
-
if isinstance(adata, ad.AnnData):
|
2413
|
-
self._adata_obs = adata.obs
|
2414
|
-
else:
|
2415
|
-
self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
|
2416
|
-
|
2417
|
-
# Add defaults first to ensure that we fetch valid sources
|
2418
|
-
if defaults:
|
2419
|
-
_add_defaults_to_obs(self._adata_obs, defaults)
|
2420
|
-
|
2421
|
-
self.sources = self._create_sources(self._adata_obs)
|
2422
|
-
self.sources = {
|
2423
|
-
entity: source
|
2424
|
-
for entity, source in self.sources.items()
|
2425
|
-
if source is not None
|
2426
|
-
}
|
2427
|
-
|
2428
2405
|
# These sources are not a part of the cellxgene schema but rather passed through.
|
2429
2406
|
# This is useful when other Curators extend the CELLxGENE curator
|
2430
2407
|
if extra_sources:
|
2431
|
-
|
2408
|
+
sources = sources | extra_sources
|
2432
2409
|
|
2433
|
-
|
2434
|
-
exclude_keys = {
|
2435
|
-
entity: default
|
2436
|
-
for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
|
2437
|
-
if entity in self._adata_obs.columns # type: ignore
|
2438
|
-
}
|
2410
|
+
_init_categoricals_additional_values()
|
2439
2411
|
|
2440
2412
|
super().__init__(
|
2441
2413
|
data=adata,
|
2442
|
-
var_index=
|
2443
|
-
categoricals=
|
2414
|
+
var_index=bt.Gene.ensembl_gene_id,
|
2415
|
+
categoricals=categoricals,
|
2444
2416
|
verbosity=verbosity,
|
2445
2417
|
organism=organism,
|
2446
|
-
sources=
|
2447
|
-
exclude=exclude_keys,
|
2418
|
+
sources=sources,
|
2448
2419
|
)
|
2449
2420
|
|
2450
2421
|
@classmethod
|
2451
|
-
|
2452
|
-
import bionty as bt
|
2453
|
-
|
2454
|
-
import lamindb as ln
|
2455
|
-
|
2456
|
-
# Note: if you add another control below, be mindful to change the if condition that
|
2457
|
-
# triggers whether creating these records is re-considered
|
2458
|
-
if cls._controls_were_created is None:
|
2459
|
-
cls._controls_were_created = (
|
2460
|
-
ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
|
2461
|
-
is not None
|
2462
|
-
)
|
2463
|
-
if not cls._controls_were_created:
|
2464
|
-
logger.important("Creating control labels in the CellxGene schema.")
|
2465
|
-
bt.CellType(
|
2466
|
-
ontology_id="unknown",
|
2467
|
-
name="unknown",
|
2468
|
-
description="From CellxGene schema.",
|
2469
|
-
).save()
|
2470
|
-
pato = bt.Source.filter(name="pato", version="2024-03-28").one()
|
2471
|
-
normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
|
2472
|
-
bt.Disease(
|
2473
|
-
uid=normal.uid,
|
2474
|
-
name=normal.name,
|
2475
|
-
ontology_id=normal.ontology_id,
|
2476
|
-
description=normal.description,
|
2477
|
-
source=normal.source,
|
2478
|
-
).save()
|
2479
|
-
bt.Ethnicity(
|
2480
|
-
ontology_id="na", name="na", description="From CellxGene schema."
|
2481
|
-
).save()
|
2482
|
-
bt.Ethnicity(
|
2483
|
-
ontology_id="unknown",
|
2484
|
-
name="unknown",
|
2485
|
-
description="From CellxGene schema.",
|
2486
|
-
).save()
|
2487
|
-
bt.DevelopmentalStage(
|
2488
|
-
ontology_id="unknown",
|
2489
|
-
name="unknown",
|
2490
|
-
description="From CellxGene schema.",
|
2491
|
-
).save()
|
2492
|
-
bt.Phenotype(
|
2493
|
-
ontology_id="unknown",
|
2494
|
-
name="unknown",
|
2495
|
-
description="From CellxGene schema.",
|
2496
|
-
).save()
|
2497
|
-
|
2498
|
-
tissue_type = ln.ULabel(
|
2499
|
-
name="TissueType",
|
2500
|
-
is_type=True,
|
2501
|
-
description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
|
2502
|
-
).save()
|
2503
|
-
ln.ULabel(
|
2504
|
-
name="tissue", type=tissue_type, description="From CellxGene schema."
|
2505
|
-
).save()
|
2506
|
-
ln.ULabel(
|
2507
|
-
name="organoid", type=tissue_type, description="From CellxGene schema."
|
2508
|
-
).save()
|
2509
|
-
ln.ULabel(
|
2510
|
-
name="cell culture",
|
2511
|
-
type=tissue_type,
|
2512
|
-
description="From CellxGene schema.",
|
2513
|
-
).save()
|
2514
|
-
|
2515
|
-
suspension_type = ln.ULabel(
|
2516
|
-
name="SuspensionType",
|
2517
|
-
is_type=True,
|
2518
|
-
description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
|
2519
|
-
).save()
|
2520
|
-
ln.ULabel(
|
2521
|
-
name="cell", type=suspension_type, description="From CellxGene schema."
|
2522
|
-
).save()
|
2523
|
-
ln.ULabel(
|
2524
|
-
name="nucleus",
|
2525
|
-
type=suspension_type,
|
2526
|
-
description="From CellxGene schema.",
|
2527
|
-
).save()
|
2528
|
-
ln.ULabel(name="na", type=suspension_type).save()
|
2529
|
-
|
2530
|
-
@classmethod
|
2531
|
-
def _get_categoricals(cls) -> dict[str, FieldAttr]:
|
2532
|
-
import bionty as bt
|
2533
|
-
|
2534
|
-
return {
|
2535
|
-
"assay": bt.ExperimentalFactor.name,
|
2536
|
-
"assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
|
2537
|
-
"cell_type": bt.CellType.name,
|
2538
|
-
"cell_type_ontology_term_id": bt.CellType.ontology_id,
|
2539
|
-
"development_stage": bt.DevelopmentalStage.name,
|
2540
|
-
"development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
|
2541
|
-
"disease": bt.Disease.name,
|
2542
|
-
"disease_ontology_term_id": bt.Disease.ontology_id,
|
2543
|
-
# "donor_id": "str", via pandera
|
2544
|
-
"self_reported_ethnicity": bt.Ethnicity.name,
|
2545
|
-
"self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
|
2546
|
-
"sex": bt.Phenotype.name,
|
2547
|
-
"sex_ontology_term_id": bt.Phenotype.ontology_id,
|
2548
|
-
"suspension_type": ULabel.name,
|
2549
|
-
"tissue": bt.Tissue.name,
|
2550
|
-
"tissue_ontology_term_id": bt.Tissue.ontology_id,
|
2551
|
-
"tissue_type": ULabel.name,
|
2552
|
-
"organism": bt.Organism.name,
|
2553
|
-
"organism_ontology_term_id": bt.Organism.ontology_id,
|
2554
|
-
}
|
2555
|
-
|
2556
|
-
@classmethod
|
2422
|
+
@deprecated(new_name="cxg_categoricals_defaults")
|
2557
2423
|
def _get_categoricals_defaults(cls) -> dict[str, str]:
|
2558
|
-
return
|
2559
|
-
"cell_type": "unknown",
|
2560
|
-
"development_stage": "unknown",
|
2561
|
-
"disease": "normal",
|
2562
|
-
"donor_id": "unknown",
|
2563
|
-
"self_reported_ethnicity": "unknown",
|
2564
|
-
"sex": "unknown",
|
2565
|
-
"suspension_type": "cell",
|
2566
|
-
"tissue_type": "tissue",
|
2567
|
-
}
|
2568
|
-
|
2569
|
-
@property
|
2570
|
-
def pinned_ontologies(self) -> pd.DataFrame:
|
2571
|
-
return self._pinned_ontologies
|
2572
|
-
|
2573
|
-
@property
|
2574
|
-
def adata(self) -> AnnData:
|
2575
|
-
return self._adata
|
2576
|
-
|
2577
|
-
def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
|
2578
|
-
"""Creates a sources dictionary that can be passed to AnnDataCatManager."""
|
2579
|
-
import bionty as bt
|
2580
|
-
|
2581
|
-
# fmt: off
|
2582
|
-
def _fetch_bionty_source(
|
2583
|
-
entity: str, organism: str, source: str
|
2584
|
-
) -> bt.Source | None:
|
2585
|
-
"""Fetch the Bionty source of the pinned ontology.
|
2586
|
-
|
2587
|
-
Returns None if the source does not exist.
|
2588
|
-
"""
|
2589
|
-
version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
|
2590
|
-
(self._pinned_ontologies["organism"] == organism) &
|
2591
|
-
(self._pinned_ontologies["source"] == source), "version"].iloc[0]
|
2592
|
-
return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
|
2593
|
-
|
2594
|
-
entity_mapping = {
|
2595
|
-
"var_index": ("Gene", self.organism, "ensembl"),
|
2596
|
-
"cell_type": ("CellType", "all", "cl"),
|
2597
|
-
"assay": ("ExperimentalFactor", "all", "efo"),
|
2598
|
-
"self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
|
2599
|
-
"development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
|
2600
|
-
"disease": ("Disease", "all", "mondo"),
|
2601
|
-
# "organism": ("Organism", "vertebrates", "ensembl"),
|
2602
|
-
"sex": ("Phenotype", "all", "pato"),
|
2603
|
-
"tissue": ("Tissue", "all", "uberon"),
|
2604
|
-
}
|
2605
|
-
# fmt: on
|
2606
|
-
|
2607
|
-
# Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
|
2608
|
-
entity_to_sources = {
|
2609
|
-
entity: _fetch_bionty_source(*params)
|
2610
|
-
for entity, params in entity_mapping.items()
|
2611
|
-
if entity in obs.columns
|
2612
|
-
or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
|
2613
|
-
or entity == "var_index"
|
2614
|
-
}
|
2615
|
-
|
2616
|
-
return entity_to_sources
|
2424
|
+
return cls.cxg_categoricals_defaults
|
2617
2425
|
|
2618
|
-
|
2619
|
-
|
2426
|
+
@classmethod
|
2427
|
+
def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
|
2428
|
+
"""Returns the CELLxGENE schema mapped fields."""
|
2429
|
+
from ._cellxgene_schemas import _get_cxg_categoricals
|
2620
2430
|
|
2621
|
-
|
2622
|
-
"""
|
2623
|
-
field_name = field.field.name
|
2624
|
-
assert field_name == "name" # noqa: S101
|
2625
|
-
cols = ["name", "ontology_id"]
|
2626
|
-
registry = field.field.model
|
2627
|
-
|
2628
|
-
if hasattr(registry, "ontology_id"):
|
2629
|
-
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2630
|
-
mapper = (
|
2631
|
-
pd.DataFrame(validated_records.values_list(*cols))
|
2632
|
-
.set_index(0)
|
2633
|
-
.to_dict()[1]
|
2634
|
-
)
|
2635
|
-
return values.map(mapper)
|
2431
|
+
return _get_cxg_categoricals()
|
2636
2432
|
|
2637
|
-
def validate(self) -> bool:
|
2433
|
+
def validate(self) -> bool:
|
2638
2434
|
"""Validates the AnnData object against most cellxgene requirements."""
|
2435
|
+
from ._cellxgene_schemas import RESERVED_NAMES
|
2436
|
+
|
2639
2437
|
# Verify that all required obs columns are present
|
2438
|
+
required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
|
2640
2439
|
missing_obs_fields = [
|
2641
2440
|
name
|
2642
|
-
for name in
|
2441
|
+
for name in required_columns
|
2643
2442
|
if name not in self._adata.obs.columns
|
2644
2443
|
and f"{name}_ontology_term_id" not in self._adata.obs.columns
|
2645
2444
|
]
|
2646
2445
|
if len(missing_obs_fields) > 0:
|
2647
|
-
|
2648
|
-
|
2649
|
-
|
2650
|
-
"consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
|
2651
|
-
"to automatically add these columns with default values."
|
2446
|
+
logger.error(
|
2447
|
+
f"missing required obs columns {_format_values(missing_obs_fields)}\n"
|
2448
|
+
" → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
|
2652
2449
|
)
|
2653
2450
|
return False
|
2654
2451
|
|
2655
2452
|
# Verify that no cellxgene reserved names are present
|
2656
|
-
reserved_names = {
|
2657
|
-
"ethnicity",
|
2658
|
-
"ethnicity_ontology_term_id",
|
2659
|
-
"X_normalization",
|
2660
|
-
"default_field",
|
2661
|
-
"layer_descriptions",
|
2662
|
-
"tags",
|
2663
|
-
"versions",
|
2664
|
-
"contributors",
|
2665
|
-
"preprint_doi",
|
2666
|
-
"project_description",
|
2667
|
-
"project_links",
|
2668
|
-
"project_name",
|
2669
|
-
"publication_doi",
|
2670
|
-
}
|
2671
2453
|
matched_columns = [
|
2672
|
-
column for column in self._adata.obs.columns if column in
|
2454
|
+
column for column in self._adata.obs.columns if column in RESERVED_NAMES
|
2673
2455
|
]
|
2674
2456
|
if len(matched_columns) > 0:
|
2675
2457
|
raise ValueError(
|
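
Note on the hunk above: validate() now derives its required obs columns from cxg_categoricals_defaults plus donor_id and reads reserved column names from the RESERVED_NAMES constant in _cellxgene_schemas. A minimal sketch of pre-filling those defaults before curation, assuming only that the defaults behave like the plain dict listed above (the AnnData object and the chosen subset are illustrative, not part of the package):

    import anndata as ad
    import pandas as pd

    # toy AnnData whose obs is missing several CELLxGENE columns
    adata = ad.AnnData(obs=pd.DataFrame({"donor_id": ["d1", "d2"]}))

    defaults = {"cell_type": "unknown", "disease": "normal", "tissue_type": "tissue"}  # subset of the defaults shown above
    for column, default in defaults.items():
        has_term_id = f"{column}_ontology_term_id" in adata.obs.columns
        if column not in adata.obs.columns and not has_term_id:
            adata.obs[column] = default  # fill with the schema default so the column check passes
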
@@ -2696,6 +2478,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
         Returns:
             An AnnData object which adheres to the cellxgene-schema.
         """
+
+        def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
+            """Converts a column that stores a name into a column that stores the ontology id.
+
+            cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
+            """
+            field_name = field.field.name
+            assert field_name == "name"  # noqa: S101
+            cols = ["name", "ontology_id"]
+            registry = field.field.model
+
+            if hasattr(registry, "ontology_id"):
+                validated_records = registry.filter(**{f"{field_name}__in": values})
+                mapper = (
+                    pd.DataFrame(validated_records.values_list(*cols))
+                    .set_index(0)
+                    .to_dict()[1]
+                )
+                return values.map(mapper)
+
         # Create a copy since we modify the AnnData object extensively
         adata_cxg = self._adata.copy()
 
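
The _convert_name_to_ontology_id helper added above maps a column of human-readable names to ontology IDs via the registry's name/ontology_id pairs. The same mapping step in isolation, with a hard-coded lookup standing in for the registry query (the terms are illustrative only):

    import pandas as pd

    mapper = {"T cell": "CL:0000084", "B cell": "CL:0000236"}  # stand-in for the name -> ontology_id query
    names = pd.Series(["T cell", "B cell", "T cell"])
    ontology_ids = names.map(mapper)  # unmatched names become NaN, just like values.map(mapper) above
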
@@ -2715,7 +2517,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
         # convert name column to ontology_term_id column
         for column in adata_cxg.obs.columns:
             if column in self.categoricals and not column.endswith("_ontology_term_id"):
-                mapped_column =
+                mapped_column = _convert_name_to_ontology_id(
                     adata_cxg.obs[column], field=self.categoricals.get(column)
                 )
                 if mapped_column is not None:
@@ -2881,7 +2683,7 @@ class TimeHandler:
 
 
 class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
-    """
+    """Categorical manager for `AnnData` to manage perturbations."""
 
     PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
 
@@ -2892,45 +2694,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
         pert_dose: bool = True,
         pert_time: bool = True,
         *,
+        cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
         verbosity: str = "hint",
-        cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
     ):
         """Initialize the curator with configuration and validation settings."""
-        import bionty as bt
-
         self._pert_time = pert_time
         self._pert_dose = pert_dose
 
         self._validate_initial_data(adata)
-        self.
-
-        self._setup_sources(adata)
-        self._setup_compound_source()
+        categoricals, categoricals_defaults = self._configure_categoricals(adata)
 
         super().__init__(
             adata=adata,
-            categoricals=
-            defaults=
-            verbosity=verbosity,
+            categoricals=categoricals,
+            defaults=categoricals_defaults,
             organism=organism,
-            extra_sources=self.
+            extra_sources=self._configure_sources(adata),
             schema_version=cxg_schema_version,
+            verbosity=verbosity,
         )
 
-    def
+    def _configure_categoricals(self, adata: ad.AnnData):
         """Set up default configuration values."""
         import bionty as bt
         import wetlab as wl
 
-
-            CellxGeneAnnDataCatManager._get_categoricals_defaults()
-            | {
-                "cell_line": "unknown",
-                "pert_target": "unknown",
-            }
-        )
-
-        self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
+        categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
             k: v
             for k, v in {
                 "cell_line": bt.CellLine.name,
@@ -2942,22 +2731,41 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
             }.items()
             if k in adata.obs.columns
         }
-        # if "donor_id" in
-        #
+        # if "donor_id" in categoricals:
+        #     categoricals["donor_id"] = Donor.name
+
+        categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
+            "cell_line": "unknown",
+            "pert_target": "unknown",
+        }
 
-
+        return categoricals, categoricals_defaults
+
+    def _configure_sources(self, adata: ad.AnnData):
         """Set up data sources."""
-
+        import bionty as bt
+        import wetlab as wl
+
+        sources = {}
+        # # do not yet specify cell_line source
         # if "cell_line" in adata.obs.columns:
-        #
-        #
-        # )
+        #     sources["cell_line"] = bt.Source.filter(
+        #         entity="bionty.CellLine", name="depmap"
+        #     ).first()
         if "pert_compound" in adata.obs.columns:
-
+            with logger.mute():
+                chebi_source = bt.Source.filter(
+                    entity="wetlab.Compound", name="chebi"
+                ).first()
+                if not chebi_source:
+                    wl.Compound.add_source(
+                        bt.Source.filter(entity="Drug", name="chebi").first()
+                    )
 
-
+            sources["pert_compound"] = bt.Source.filter(
                 entity="wetlab.Compound", name="chebi"
             ).first()
+        return sources
 
     def _validate_initial_data(self, adata: ad.AnnData):
         """Validate the initial data structure."""
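
The new _configure_categoricals and _configure_sources helpers return plain dicts instead of mutating instance attributes. The dict-union and column-filtering idiom they rely on, shown standalone (the field names here are placeholder strings, not real registry fields):

    base_defaults = {"cell_type": "unknown", "disease": "normal"}
    pert_defaults = {"cell_line": "unknown", "pert_target": "unknown"}
    defaults = base_defaults | pert_defaults  # PEP 584 union; right-hand side wins on key collisions

    obs_columns = ["cell_type", "pert_target"]
    candidates = {"cell_line": "CellLine.name", "pert_target": "Phenotype.name"}
    categoricals = {k: v for k, v in candidates.items() if k in obs_columns}
    assert categoricals == {"pert_target": "Phenotype.name"}
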
@@ -3005,20 +2813,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
             adata.obs[col_name].cat.remove_unused_categories()
             logger.important(f"mapped 'pert_name' to '{col_name}'")
 
-    def _setup_compound_source(self):
-        """Set up the compound source with muted logging."""
-        import bionty as bt
-        import wetlab as wl
-
-        with logger.mute():
-            chebi_source = bt.Source.filter(
-                entity="wetlab.Compound", name="chebi"
-            ).first()
-            if not chebi_source:
-                wl.Compound.add_source(
-                    bt.Source.filter(entity="Drug", name="chebi").first()
-                )
-
     def validate(self) -> bool:  # type: ignore
         """Validate the AnnData object."""
         validated = super().validate()
@@ -3136,70 +2930,47 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
 
 def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
     """Make sure the source and organism are saved in the same database as the registry."""
-    from lamindb.core._settings import settings
-
     db = registry.filter().db
     source = kwargs.get("source")
     organism = kwargs.get("organism")
     filter_kwargs = kwargs.copy()
-    try:
-        verbosity = settings.verbosity
-        settings.verbosity = "error"
-        if isinstance(organism, Record) and organism._state.db != "default":
-            if db is None or db == "default":
-                organism_default = copy.copy(organism)
-                # save the organism record in the default database
-                organism_default.save()
-                filter_kwargs["organism"] = organism_default
-        if isinstance(source, Record) and source._state.db != "default":
-            if db is None or db == "default":
-                source_default = copy.copy(source)
-                # save the source record in the default database
-                source_default.save()
-                filter_kwargs["source"] = source_default
-    finally:
-        settings.verbosity = verbosity
-    return filter_kwargs
-
 
-
-
-
-
-
-
-):
-
-
-
-
-
-    exclude = [exclude] if isinstance(exclude, str) else exclude
-    exclude = [i for i in exclude if i in values]
-    if len(exclude) > 0:
-        # exclude values are validated without source and organism
-        inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
-        # if exclude values are validated, remove them from the values
-        values = [i for i in values if i not in inspect_result_exclude.validated]
-        include_validated = inspect_result_exclude.validated
-
-    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
-    inspect_result._validated += include_validated
-    inspect_result._non_validated = [
-        i for i in inspect_result.non_validated if i not in include_validated
-    ]
+    if isinstance(organism, Record) and organism._state.db != "default":
+        if db is None or db == "default":
+            organism_default = copy.copy(organism)
+            # save the organism record in the default database
+            organism_default.save()
+            filter_kwargs["organism"] = organism_default
+    if isinstance(source, Record) and source._state.db != "default":
+        if db is None or db == "default":
+            source_default = copy.copy(source)
+            # save the source record in the default database
+            source_default.save()
+            filter_kwargs["source"] = source_default
 
-    return
+    return filter_kwargs
 
 
-def
+def get_organism_kwargs(
+    field: FieldAttr, organism: str | None = None
+) -> dict[str, str]:
     """Check if a registry needs an organism and return the organism name."""
-
+    registry = field.field.model
+    if registry.__base__.__name__ == "BioRecord":
         import bionty as bt
+        from bionty._organism import is_organism_required
 
-
-
-
+        from ..models._from_values import get_organism_record_from_field
+
+        if is_organism_required(registry):
+            if organism is not None or bt.settings.organism is not None:
+                return {"organism": organism or bt.settings.organism.name}
+            else:
+                organism_record = get_organism_record_from_field(
+                    field, organism=organism
+                )
+                if organism_record is not None:
+                    return {"organism": organism_record.name}
     return {}
 
 
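
get_organism_kwargs above only injects an organism for registries that actually require one. An illustrative call shape, assuming the function is in scope, a configured lamindb instance, and bionty installed; the fields are examples, not a prescription:

    import bionty as bt

    get_organism_kwargs(bt.Gene.symbol, organism="human")  # organism-aware registry, expected to return {"organism": "human"}
    get_organism_kwargs(bt.CellType.name)                  # no organism needed, expected to return {}
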
@@ -3209,7 +2980,6 @@ def validate_categories(
     key: str,
     organism: str | None = None,
     source: Record | None = None,
-    exclude: str | list | None = None,
     hint_print: str | None = None,
     curator: CatManager | None = None,
 ) -> tuple[bool, list[str]]:
@@ -3221,13 +2991,9 @@ def validate_categories(
         key: The key referencing the slot in the DataFrame.
         organism: The organism name.
         source: The source record.
-        exclude: Exclude specific values from validation.
         standardize: Whether to standardize the values.
         hint_print: The hint to print that suggests fixing non-validated values.
     """
-    from lamindb.core._settings import settings
-    from lamindb.models._from_values import _format_values
-
     model_field = f"{field.field.model.__name__}.{field.field.name}"
 
     def _log_mapping_info():
@@ -3237,36 +3003,25 @@ def validate_categories(
 
     registry = field.field.model
 
-
-
-
-    kwargs_current = get_current_filter_kwargs(registry, kwargs)
+    kwargs_current = get_current_filter_kwargs(
+        registry, {"organism": organism, "source": source}
+    )
 
     # inspect values from the default instance
-    inspect_result =
-        values=values,
-        field=field,
-        registry=registry,
-        exclude=exclude,
-        **kwargs_current,
-    )
+    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
     non_validated = inspect_result.non_validated
     syn_mapper = inspect_result.synonyms_mapper
 
-    # inspect the non-validated values from public (
+    # inspect the non-validated values from public (BioRecord only)
     values_validated = []
     if hasattr(registry, "public"):
-
-
-
-
-
-
-
-            )
-            values_validated += [getattr(r, field.field.name) for r in public_records]
-        finally:
-            settings.verbosity = verbosity
+        public_records = registry.from_values(
+            non_validated,
+            field=field,
+            mute=True,
+            **kwargs_current,
+        )
+        values_validated += [getattr(r, field.field.name) for r in public_records]
 
     # logging messages
     non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
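
validate_categories now uses a plain two-step lookup: inspect against the current instance, then try to pull the remaining values from the public source with from_values. The same flow in isolation, assuming a configured instance with bionty installed (the values are toy data):

    import bionty as bt

    values = ["T cell", "my mystery cell"]
    inspect_result = bt.CellType.inspect(values, field=bt.CellType.name, mute=True)  # step 1: local instance
    public_records = bt.CellType.from_values(
        inspect_result.non_validated, field=bt.CellType.name, mute=True
    )                                                                                # step 2: public ontology
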
@@ -3330,7 +3085,6 @@ def validate_categories_in_df(
     df: pd.DataFrame,
     fields: dict[str, FieldAttr],
     sources: dict[str, Record] = None,
-    exclude: dict | None = None,
     curator: CatManager | None = None,
     **kwargs,
 ) -> tuple[bool, dict]:
@@ -3348,7 +3102,6 @@ def validate_categories_in_df(
             field=field,
             key=key,
             source=sources.get(key),
-            exclude=exclude.get(key) if exclude else None,
             curator=curator,
             **kwargs,
         )
@@ -3359,9 +3112,10 @@ def validate_categories_in_df(
 
 
 def save_artifact(
-    data: pd.DataFrame |
+    data: pd.DataFrame | ScverseDataStructures,
+    *,
     fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
-
+    index_field: FieldAttr | dict[str, FieldAttr] | None = None,
     description: str | None = None,
     organism: str | None = None,
     key: str | None = None,
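
save_artifact now takes everything after data as keyword-only and names the index-validation argument index_field. A sketch of the new call shape, assuming a configured instance; the data object and the concrete fields are placeholders:

    import bionty as bt

    adata = ...  # an AnnData object validated earlier (placeholder)
    artifact = save_artifact(
        adata,                                   # a DataFrame, AnnData, MuData, or SpatialData object
        fields={"cell_type": bt.CellType.name},
        index_field=bt.Gene.ensembl_gene_id,     # validates the var index for AnnData-like data
        description="curated dataset",
        organism="human",
    )
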
@@ -3369,73 +3123,52 @@ def save_artifact(
     revises: Artifact | None = None,
     run: Run | None = None,
     schema: Schema | None = None,
+    **kwargs,
 ) -> Artifact:
     """Save all metadata with an Artifact.
 
     Args:
-        data: The
+        data: The object to save.
         fields: A dictionary mapping obs_column to registry_field.
-
+        index_field: The registry field to validate variables index against.
         description: A description of the artifact.
         organism: The organism name.
-        type: The artifact type.
         key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
         artifact: A already registered artifact. Passing this will not save a new artifact from data.
         revises: Previous version of the artifact. Triggers a revision.
         run: The run that creates the artifact.
+        schema: The Schema to associate with the Artifact.
 
     Returns:
         The saved Artifact.
     """
-    from ..models.artifact import add_labels
+    from ..models.artifact import add_labels
 
     if artifact is None:
-        if
-            artifact = Artifact.
+        if isinstance(data, pd.DataFrame):
+            artifact = Artifact.from_df(
                 data, description=description, key=key, revises=revises, run=run
             )
-        elif isinstance(data,
-            artifact = Artifact.
+        elif isinstance(data, AnnData):
+            artifact = Artifact.from_anndata(
                 data, description=description, key=key, revises=revises, run=run
             )
         elif data_is_mudata(data):
             artifact = Artifact.from_mudata(
-                data,
-
-
-
-                run=run
+                data, description=description, key=key, revises=revises, run=run
+            )
+        elif data_is_spatialdata(data):
+            artifact = Artifact.from_spatialdata(
+                data, description=description, key=key, revises=revises, run=run
+            )
+        else:
+            raise InvalidArgument(  # pragma: no cover
+                "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
             )
-    artifact.schema = schema
     artifact.save()
 
-    if organism is not None and columns_field is not None:
-        feature_kwargs = check_registry_organism(
-            (
-                list(columns_field.values())[0].field.model
-                if isinstance(columns_field, dict)
-                else columns_field.field.model
-            ),
-            organism,
-        )
-    else:
-        feature_kwargs = {}
-
-    if artifact.otype == "DataFrame":
-        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)  # type: ignore
-    elif artifact.otype == "AnnData":
-        artifact.features._add_set_from_anndata(  # type: ignore
-            var_field=columns_field, **feature_kwargs
-        )
-    elif artifact.otype == "MuData":
-        artifact.features._add_set_from_mudata(  # type: ignore
-            var_fields=columns_field, **feature_kwargs
-        )
-    else:
-        raise NotImplementedError
-
     def _add_labels(
-        data,
+        data: pd.DataFrame | ScverseDataStructures,
         artifact: Artifact,
         fields: dict[str, FieldAttr],
         feature_ref_is_name: bool | None = None,
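
The constructor dispatch above picks Artifact.from_df, from_anndata, from_mudata, or from_spatialdata based on the data type. For the simplest branch, a hedged standalone example, assuming a configured lamindb instance (the key and description are made up):

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"cell_type": ["T cell", "B cell"]})
    artifact = ln.Artifact.from_df(df, description="toy table", key="examples/toy.parquet").save()
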
@@ -3444,19 +3177,15 @@ def save_artifact(
         for key, field in fields.items():
             feature = features.get(key)
             registry = field.field.model
-
-
+            # we don't need source here because all records are already in the DB
+            filter_kwargs = get_current_filter_kwargs(registry, {"organism": organism})
             df = data if isinstance(data, pd.DataFrame) else data.obs
             # multi-value columns are separated by "|"
             if not df[key].isna().all() and df[key].str.contains("|").any():
                 values = df[key].str.split("|").explode().unique()
             else:
                 values = df[key].unique()
-            labels = registry.from_values(
-                values,
-                field=field,
-                **filter_kwargs_current,
-            )
+            labels = registry.from_values(values, field=field, **filter_kwargs)
             if len(labels) == 0:
                 continue
             label_ref_is_name = None
@@ -3471,35 +3200,87 @@ def save_artifact(
                 from_curator=True,
             )
 
-
-
-
-
-
-
-
-
-
-
-
-                        else _ref_is_name(column_field_modality)
-                    ),
-                )
+    match artifact.otype:
+        case "DataFrame":
+            artifact.features._add_set_from_df(field=index_field, organism=organism)  # type: ignore
+            _add_labels(
+                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
+            )
+        case "AnnData":
+            if schema is not None and "uns" in schema.slots:
+                uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
+                    "field"
+                ]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                uns_field = None
+            artifact.features._add_set_from_anndata(  # type: ignore
+                var_field=index_field, uns_field=uns_field, organism=organism
+            )
+            _add_labels(
+                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
+            )
+        case "MuData":
+            artifact.features._add_set_from_mudata(  # type: ignore
+                var_fields=index_field, organism=organism
+            )
+            for modality, modality_fields in fields.items():
+                column_field_modality = index_field.get(modality)
+                if modality == "obs":
+                    _add_labels(
+                        data,
+                        artifact,
+                        modality_fields,
+                        feature_ref_is_name=(
+                            None
+                            if column_field_modality is None
+                            else _ref_is_name(column_field_modality)
+                        ),
+                    )
+                else:
+                    _add_labels(
+                        data[modality],
+                        artifact,
+                        modality_fields,
+                        feature_ref_is_name=(
+                            None
+                            if column_field_modality is None
+                            else _ref_is_name(column_field_modality)
+                        ),
+                    )
+        case "SpatialData":
+            artifact.features._add_set_from_spatialdata(  # type: ignore
+                sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
+                var_fields=index_field,
+                organism=organism,
+            )
+            sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
+            for accessor, accessor_fields in fields.items():
+                column_field = index_field.get(accessor)
+                if accessor == sample_metadata_key:
+                    _add_labels(
+                        data.get_attrs(
+                            key=sample_metadata_key, return_as="df", flatten=True
+                        ),
+                        artifact,
+                        accessor_fields,
+                        feature_ref_is_name=(
+                            None if column_field is None else _ref_is_name(column_field)
+                        ),
+                    )
+                else:
+                    _add_labels(
+                        data.tables[accessor],
+                        artifact,
+                        accessor_fields,
+                        feature_ref_is_name=(
+                            None if column_field is None else _ref_is_name(column_field)
+                        ),
+                    )
+        case _:
+            raise NotImplementedError  # pragma: no cover
+
+    artifact.schema = schema
+    artifact.save()
 
     slug = ln_setup.settings.instance.slug
     if ln_setup.settings.instance.is_remote:  # pdagma: no cover
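
The feature-linking logic above now dispatches on artifact.otype with structural pattern matching (Python 3.10+). The same construct in a self-contained form, without the lamindb-specific bodies:

    def kind(otype: str) -> str:
        # mirrors the dispatch above
        match otype:
            case "DataFrame":
                return "tabular"
            case "AnnData" | "MuData" | "SpatialData":
                return "scverse"
            case _:
                raise NotImplementedError(otype)

    assert kind("MuData") == "scverse"
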
@@ -3529,8 +3310,7 @@ def update_registry(
     organism: str | None = None,
     dtype: str | None = None,
     source: Record | None = None,
-
-    **kwargs,
+    **create_kwargs,
 ) -> None:
     """Save features or labels records in the default instance..
 
@@ -3543,82 +3323,68 @@ def update_registry(
         organism: The organism name.
         dtype: The type of the feature.
         source: The source record.
-
-        kwargs: Additional keyword arguments to pass to the registry model to create new records.
+        **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
     """
-    from lamindb.core._settings import settings
     from lamindb.models.save import save as ln_save
 
     registry = field.field.model
-    filter_kwargs =
-
+    filter_kwargs = get_current_filter_kwargs(
+        registry, {"organism": organism, "source": source}
+    )
     values = [i for i in values if isinstance(i, str) and i]
     if not values:
         return
 
-
-    try:
-        settings.verbosity = "error"
-        labels_saved: dict = {"from public": [], "new": []}
+    labels_saved: dict = {"from public": [], "new": []}
 
-
-
-
-
-
-
-
-
-
-
-
-
-        if source
-
-
-
-
-
-            ln_save(public_records)
-            labels_saved["from public"] = [
-                getattr(r, field.field.name) for r in public_records
-            ]
-        # non-validated records from the default instance
-        non_validated_labels = [
-            i for i in values if i not in existing_and_public_labels
+    # inspect the default instance and save validated records from public
+    existing_and_public_records = registry.from_values(
+        list(values), field=field, **filter_kwargs, mute=True
+    )
+    existing_and_public_labels = [
+        getattr(r, field.field.name) for r in existing_and_public_records
+    ]
+    # public records that are not already in the database
+    public_records = [r for r in existing_and_public_records if r._state.adding]
+    # here we check to only save the public records if they are from the specified source
+    # we check the uid because r.source and source can be from different instances
+    if source:
+        public_records = [r for r in public_records if r.source.uid == source.uid]
+    if len(public_records) > 0:
+        logger.info(f"saving validated records of '{key}'")
+        ln_save(public_records)
+        labels_saved["from public"] = [
+            getattr(r, field.field.name) for r in public_records
         ]
+    # non-validated records from the default instance
+    non_validated_labels = [i for i in values if i not in existing_and_public_labels]
+
+    # save non-validated/new records
+    labels_saved["new"] = non_validated_labels
+    if not validated_only:
+        non_validated_records: RecordList[Any] = []  # type: ignore
+        if df is not None and registry == Feature:
+            nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
+            non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
+        else:
+            if (
+                organism
+                and hasattr(registry, "organism")
+                and registry._meta.get_field("organism").is_relation
+            ):
+                # make sure organism record is saved to the current instance
+                create_kwargs["organism"] = _save_organism(name=organism)
 
-
-
-
-
-
-
-            non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
-        else:
-            if "organism" in filter_kwargs:
-                # make sure organism record is saved to the current instance
-                filter_kwargs["organism"] = _save_organism(name=organism)
-            init_kwargs = {}
-            for value in labels_saved["new"]:
-                init_kwargs[field.field.name] = value
-                if registry == Feature:
-                    init_kwargs["dtype"] = "cat" if dtype is None else dtype
-                non_validated_records.append(
-                    registry(
-                        **init_kwargs,
-                        **{k: v for k, v in filter_kwargs.items() if k != "source"},
-                        **{k: v for k, v in kwargs.items() if k != "sources"},
-                    )
-                )
-        ln_save(non_validated_records)
-
-        # save parent labels for ulabels, for example a parent label "project" for label "project001"
-        if registry == ULabel and field.field.name == "name":
-            save_ulabels_parent(values, field=field, key=key)
+            for value in labels_saved["new"]:
+                init_kwargs = {field.field.name: value}
+                if registry == Feature:
+                    init_kwargs["dtype"] = "cat" if dtype is None else dtype
+                non_validated_records.append(registry(**init_kwargs, **create_kwargs))
+        ln_save(non_validated_records)
 
-
-
+    # save parent labels for ulabels, for example a parent label "project" for label "project001"
+    if registry == ULabel and field.field.name == "name":
+        save_ulabels_type(values, field=field, key=key)
 
     log_saved_labels(
         labels_saved,
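
update_registry now forwards **create_kwargs directly into the registry constructor for new records instead of juggling filter kwargs. How the per-value record construction works, with a stand-in class instead of a real registry (all names here are illustrative):

    class ToyRegistry:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    create_kwargs = {"description": "added during curation"}  # hypothetical extra constructor kwargs
    new_records = []
    for value in ["plate_1", "plate_2"]:
        init_kwargs = {"name": value}                          # field.field.name would be "name" here
        new_records.append(ToyRegistry(**init_kwargs, **create_kwargs))
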
@@ -3653,16 +3419,18 @@ def log_saved_labels(
     )
 
 
-def
-    """Save
+def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
+    """Save the ULabel type of the given labels."""
     registry = field.field.model
     assert registry == ULabel  # noqa: S101
-    all_records = registry.
-
-
-
-
-
+    all_records = registry.filter(**{field.field.name: list(values)}).all()
+    # so `tissue_type` becomes `TissueType`
+    type_name = "".join([i.capitalize() for i in key.lower().split("_")])
+    ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
+    if ulabel_type is None:
+        ulabel_type = registry(name=type_name, is_type=True).save()
+        logger.important(f"Created a ULabel type: {ulabel_type}")
+    all_records.update(type=ulabel_type)
 
 
 def _save_organism(name: str):
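
save_ulabels_type derives the ULabel type name from the obs key by capitalizing each underscore-separated part. The derivation on its own:

    key = "tissue_type"
    type_name = "".join(part.capitalize() for part in key.lower().split("_"))
    assert type_name == "TissueType"
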
@@ -3674,8 +3442,9 @@ def _save_organism(name: str):
     organism = bt.Organism.from_source(name=name)
     if organism is None:
         raise ValidationError(
-            f'Organism "{name}" not found\n'
-            f' → please save it: bt.Organism(name="{name}").save()'
+            f'Organism "{name}" not found from public reference\n'
+            f' → please save it from a different source: bt.Organism.from_source(name="{name}", source).save()'
+            f' → or manually save it without source: bt.Organism(name="{name}").save()'
         )
     organism.save()
     return organism
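
The new error message offers two remediation paths when an organism is not found in the public reference. A sketch of both, assuming bionty is installed and some Organism source record is available (the organism name and the source choice are placeholders):

    import bionty as bt

    source = bt.Source.filter(entity="bionty.Organism").first()          # pick any available Organism source
    organism = bt.Organism.from_source(name="zebrafish", source=source)  # path 1: from an explicit source
    if organism is None:
        organism = bt.Organism(name="zebrafish")                         # path 2: plain record without a source
    organism.save()
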
@@ -3761,7 +3530,6 @@ def from_tiledbsoma(
     obs_columns: FieldAttr = Feature.name,
     organism: str | None = None,
     sources: dict[str, Record] | None = None,
-    exclude: dict[str, str | list[str]] | None = None,
 ) -> TiledbsomaCatManager:
     return TiledbsomaCatManager(
         experiment_uri=experiment_uri,
@@ -3770,7 +3538,6 @@ def from_tiledbsoma(
         obs_columns=obs_columns,
         organism=organism,
         sources=sources,
-        exclude=exclude,
     )
 
 
@@ -3782,7 +3549,6 @@ def from_spatialdata(
     categoricals: dict[str, dict[str, FieldAttr]] | None = None,
     organism: str | None = None,
     sources: dict[str, dict[str, Record]] | None = None,
-    exclude: dict[str, dict] | None = None,
     verbosity: str = "hint",
     *,
     sample_metadata_key: str = "sample",
@@ -3799,7 +3565,6 @@ def from_spatialdata(
         verbosity=verbosity,
         organism=organism,
         sources=sources,
-        exclude=exclude,
         sample_metadata_key=sample_metadata_key,
     )
 