lamindb 1.2a2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +1272 -1517
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +65 -14
- lamindb/models/_from_values.py +113 -78
- lamindb/models/artifact.py +142 -98
- lamindb/models/can_curate.py +185 -216
- lamindb/models/feature.py +32 -2
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +48 -25
- lamindb/models/run.py +18 -1
- lamindb/models/schema.py +0 -8
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/METADATA +7 -6
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/RECORD +28 -27
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
"""Curators.
|
2
2
|
|
3
|
-
.. versionadded:: 1.1.0
|
4
|
-
|
5
3
|
.. autosummary::
|
6
4
|
:toctree: .
|
7
5
|
|
8
|
-
Curator
|
9
6
|
DataFrameCurator
|
10
7
|
AnnDataCurator
|
8
|
+
MuDataCurator
|
9
|
+
SpatialDataCurator
|
11
10
|
|
12
|
-
|
11
|
+
Helper classes.
|
13
12
|
|
14
13
|
.. autosummary::
|
15
14
|
:toctree: .
|
16
15
|
|
16
|
+
Curator
|
17
|
+
SlotsCurator
|
17
18
|
CatManager
|
19
|
+
CatLookup
|
18
20
|
DataFrameCatManager
|
19
21
|
AnnDataCatManager
|
20
22
|
MuDataCatManager
|
23
|
+
SpatialDataCatManager
|
21
24
|
TiledbsomaCatManager
|
22
|
-
CurateLookup
|
23
25
|
|
24
26
|
"""
|
25
27
|
|
@@ -27,7 +29,6 @@ from __future__ import annotations
|
|
27
29
|
|
28
30
|
import copy
|
29
31
|
import re
|
30
|
-
from importlib import resources
|
31
32
|
from itertools import chain
|
32
33
|
from typing import TYPE_CHECKING, Any, Literal
|
33
34
|
|
@@ -37,45 +38,44 @@ import pandas as pd
|
|
37
38
|
import pandera
|
38
39
|
import pyarrow as pa
|
39
40
|
from lamin_utils import colors, logger
|
40
|
-
from lamindb_setup.core import deprecated
|
41
|
+
from lamindb_setup.core import deprecated
|
41
42
|
from lamindb_setup.core._docs import doc_args
|
42
43
|
from lamindb_setup.core.upath import UPath
|
43
44
|
|
44
|
-
from lamindb.core.storage._backed_access import backed_access
|
45
|
-
|
46
|
-
from ._cellxgene_schemas import _read_schema_versions
|
47
|
-
|
48
45
|
if TYPE_CHECKING:
|
49
|
-
from anndata import AnnData
|
50
46
|
from lamindb_setup.core.types import UPathStr
|
47
|
+
from mudata import MuData
|
48
|
+
from spatialdata import SpatialData
|
51
49
|
|
52
|
-
from lamindb.
|
50
|
+
from lamindb.core.types import ScverseDataStructures
|
53
51
|
from lamindb.models import Record
|
54
52
|
from lamindb.base.types import FieldAttr # noqa
|
55
53
|
from lamindb.core._settings import settings
|
56
54
|
from lamindb.models import (
|
57
55
|
Artifact,
|
58
|
-
Collection,
|
59
56
|
Feature,
|
60
57
|
Record,
|
61
58
|
Run,
|
62
59
|
Schema,
|
63
60
|
ULabel,
|
64
61
|
)
|
65
|
-
from lamindb.models.
|
66
|
-
|
62
|
+
from lamindb.models.artifact import (
|
63
|
+
add_labels,
|
64
|
+
data_is_anndata,
|
65
|
+
data_is_mudata,
|
66
|
+
data_is_spatialdata,
|
67
|
+
)
|
67
68
|
from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
|
68
69
|
from lamindb.models._from_values import _format_values
|
69
70
|
|
70
71
|
from ..errors import InvalidArgument, ValidationError
|
72
|
+
from anndata import AnnData
|
71
73
|
|
72
74
|
if TYPE_CHECKING:
|
73
75
|
from collections.abc import Iterable, MutableMapping
|
74
76
|
from typing import Any
|
75
77
|
|
76
78
|
from lamindb_setup.core.types import UPathStr
|
77
|
-
from mudata import MuData
|
78
|
-
from spatialdata import SpatialData
|
79
79
|
|
80
80
|
from lamindb.models.query_set import RecordList
|
81
81
|
|
@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
|
|
86
86
|
return ansi_pattern.sub("", text)
|
87
87
|
|
88
88
|
|
89
|
-
class
|
89
|
+
class CatLookup:
|
90
90
|
"""Lookup categories from the reference instance.
|
91
91
|
|
92
92
|
Args:
|
@@ -94,10 +94,10 @@ class CurateLookup:
|
|
94
94
|
slots: A dictionary of slot fields to lookup.
|
95
95
|
public: Whether to lookup from the public instance. Defaults to False.
|
96
96
|
|
97
|
-
Example
|
98
|
-
|
99
|
-
|
100
|
-
|
97
|
+
Example::
|
98
|
+
|
99
|
+
curator = ln.curators.DataFrameCurator(...)
|
100
|
+
curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
|
101
101
|
|
102
102
|
"""
|
103
103
|
|
@@ -150,7 +150,7 @@ class CurateLookup:
|
|
150
150
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
151
151
|
"To look up public ontologies, use .lookup(public=True)"
|
152
152
|
)
|
153
|
-
else: #
|
153
|
+
else: # pragma: no cover
|
154
154
|
return colors.warning("No fields are found!")
|
155
155
|
|
156
156
|
|
@@ -163,7 +163,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
|
|
163
163
|
"""
|
164
164
|
|
165
165
|
|
166
|
-
VALIDATE_DOCSTRING = """Validate dataset.
|
166
|
+
VALIDATE_DOCSTRING = """Validate dataset against Schema.
|
167
167
|
|
168
168
|
Raises:
|
169
169
|
lamindb.errors.ValidationError: If validation fails.
|
@@ -183,15 +183,17 @@ Returns:
|
|
183
183
|
|
184
184
|
|
185
185
|
class Curator:
|
186
|
-
"""
|
186
|
+
"""Curator base class.
|
187
187
|
|
188
188
|
A `Curator` object makes it easy to validate, standardize & annotate datasets.
|
189
189
|
|
190
|
-
.. versionadded:: 1.1.0
|
191
|
-
|
192
190
|
See:
|
193
191
|
- :class:`~lamindb.curators.DataFrameCurator`
|
194
192
|
- :class:`~lamindb.curators.AnnDataCurator`
|
193
|
+
- :class:`~lamindb.curators.MuDataCurator`
|
194
|
+
- :class:`~lamindb.curators.SpatialDataCurator`
|
195
|
+
|
196
|
+
.. versionadded:: 1.1.0
|
195
197
|
"""
|
196
198
|
|
197
199
|
def __init__(self, dataset: Any, schema: Schema | None = None):
|
@@ -199,7 +201,12 @@ class Curator:
|
|
199
201
|
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
200
202
|
if isinstance(self._dataset, Artifact):
|
201
203
|
self._artifact = self._dataset
|
202
|
-
if self._artifact.otype in {
|
204
|
+
if self._artifact.otype in {
|
205
|
+
"DataFrame",
|
206
|
+
"AnnData",
|
207
|
+
"MuData",
|
208
|
+
"SpatialData",
|
209
|
+
}:
|
203
210
|
self._dataset = self._dataset.load()
|
204
211
|
self._schema: Schema | None = schema
|
205
212
|
self._is_validated: bool = False
|
@@ -208,7 +215,7 @@ class Curator:
|
|
208
215
|
@doc_args(VALIDATE_DOCSTRING)
|
209
216
|
def validate(self) -> bool | str:
|
210
217
|
"""{}""" # noqa: D415
|
211
|
-
pass #
|
218
|
+
pass # pragma: no cover
|
212
219
|
|
213
220
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
214
221
|
def save_artifact(
|
@@ -225,9 +232,72 @@ class Curator:
|
|
225
232
|
pass
|
226
233
|
|
227
234
|
|
235
|
+
class SlotsCurator(Curator):
|
236
|
+
"""Curator for a dataset with slots.
|
237
|
+
|
238
|
+
Args:
|
239
|
+
dataset: The dataset to validate & annotate.
|
240
|
+
schema: A `Schema` object that defines the validation constraints.
|
241
|
+
|
242
|
+
.. versionadded:: 1.3.0
|
243
|
+
"""
|
244
|
+
|
245
|
+
def __init__(
|
246
|
+
self,
|
247
|
+
dataset: Any,
|
248
|
+
schema: Schema,
|
249
|
+
) -> None:
|
250
|
+
super().__init__(dataset=dataset, schema=schema)
|
251
|
+
self._slots: dict[str, DataFrameCurator] = {}
|
252
|
+
|
253
|
+
# used in MuDataCurator and SpatialDataCurator
|
254
|
+
# in form of {table/modality_key: var_field}
|
255
|
+
self._var_fields: dict[str, FieldAttr] = {}
|
256
|
+
# in form of {table/modality_key: categoricals}
|
257
|
+
self._categoricals: dict[str, dict[str, FieldAttr]] = {}
|
258
|
+
|
259
|
+
@property
|
260
|
+
@doc_args(SLOTS_DOCSTRING)
|
261
|
+
def slots(self) -> dict[str, DataFrameCurator]:
|
262
|
+
"""{}""" # noqa: D415
|
263
|
+
return self._slots
|
264
|
+
|
265
|
+
@doc_args(VALIDATE_DOCSTRING)
|
266
|
+
def validate(self) -> None:
|
267
|
+
"""{}""" # noqa: D415
|
268
|
+
for _, curator in self._slots.items():
|
269
|
+
curator.validate()
|
270
|
+
|
271
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
272
|
+
def save_artifact(
|
273
|
+
self,
|
274
|
+
*,
|
275
|
+
key: str | None = None,
|
276
|
+
description: str | None = None,
|
277
|
+
revises: Artifact | None = None,
|
278
|
+
run: Run | None = None,
|
279
|
+
) -> Artifact:
|
280
|
+
"""{}""" # noqa: D415
|
281
|
+
if not self._is_validated:
|
282
|
+
self.validate()
|
283
|
+
|
284
|
+
# default implementation for MuDataCurator and SpatialDataCurator
|
285
|
+
return save_artifact( # type: ignore
|
286
|
+
self._dataset,
|
287
|
+
key=key,
|
288
|
+
description=description,
|
289
|
+
fields=self._categoricals,
|
290
|
+
index_field=self._var_fields,
|
291
|
+
artifact=self._artifact,
|
292
|
+
revises=revises,
|
293
|
+
run=run,
|
294
|
+
schema=self._schema,
|
295
|
+
)
|
296
|
+
|
297
|
+
|
228
298
|
class DataFrameCurator(Curator):
|
229
299
|
# the example in the docstring is tested in test_curators_quickstart_example
|
230
|
-
"""Curator for
|
300
|
+
"""Curator for `DataFrame`.
|
231
301
|
|
232
302
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
233
303
|
|
@@ -282,7 +352,9 @@ class DataFrameCurator(Curator):
|
|
282
352
|
feature.dtype if not feature.dtype.startswith("cat") else "category"
|
283
353
|
)
|
284
354
|
pandera_columns[feature.name] = pandera.Column(
|
285
|
-
pandera_dtype,
|
355
|
+
pandera_dtype,
|
356
|
+
nullable=feature.nullable,
|
357
|
+
coerce=feature.coerce_dtype,
|
286
358
|
)
|
287
359
|
if feature.dtype.startswith("cat"):
|
288
360
|
categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
|
@@ -378,7 +450,7 @@ class DataFrameCurator(Curator):
|
|
378
450
|
description: str | None = None,
|
379
451
|
revises: Artifact | None = None,
|
380
452
|
run: Run | None = None,
|
381
|
-
):
|
453
|
+
) -> Artifact:
|
382
454
|
"""{}""" # noqa: D415
|
383
455
|
if not self._is_validated:
|
384
456
|
self.validate() # raises ValidationError if doesn't validate
|
@@ -387,7 +459,7 @@ class DataFrameCurator(Curator):
|
|
387
459
|
self._dataset,
|
388
460
|
description=description,
|
389
461
|
fields=self._cat_manager.categoricals,
|
390
|
-
|
462
|
+
index_field=result["field"],
|
391
463
|
key=key,
|
392
464
|
artifact=self._artifact,
|
393
465
|
revises=revises,
|
@@ -396,9 +468,9 @@ class DataFrameCurator(Curator):
|
|
396
468
|
)
|
397
469
|
|
398
470
|
|
399
|
-
class AnnDataCurator(
|
471
|
+
class AnnDataCurator(SlotsCurator):
|
400
472
|
# the example in the docstring is tested in test_curators_quickstart_example
|
401
|
-
"""Curator for
|
473
|
+
"""Curator for `AnnData`.
|
402
474
|
|
403
475
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
404
476
|
|
@@ -446,7 +518,7 @@ class AnnDataCurator(Curator):
|
|
446
518
|
).save()
|
447
519
|
|
448
520
|
# curate an AnnData
|
449
|
-
adata = datasets.small_dataset1(otype="AnnData")
|
521
|
+
adata = ln.core.datasets.small_dataset1(otype="AnnData")
|
450
522
|
curator = ln.curators.AnnDataCurator(adata, anndata_schema)
|
451
523
|
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
|
452
524
|
assert artifact.schema == anndata_schema
|
@@ -466,9 +538,9 @@ class AnnDataCurator(Curator):
|
|
466
538
|
self._slots = {
|
467
539
|
slot: DataFrameCurator(
|
468
540
|
(
|
469
|
-
self._dataset
|
541
|
+
getattr(self._dataset, slot).T
|
470
542
|
if slot == "var"
|
471
|
-
else self._dataset
|
543
|
+
else getattr(self._dataset, slot)
|
472
544
|
),
|
473
545
|
slot_schema,
|
474
546
|
)
|
@@ -476,18 +548,6 @@ class AnnDataCurator(Curator):
|
|
476
548
|
if slot in {"obs", "var"}
|
477
549
|
}
|
478
550
|
|
479
|
-
@property
|
480
|
-
@doc_args(SLOTS_DOCSTRING)
|
481
|
-
def slots(self) -> dict[str, DataFrameCurator]:
|
482
|
-
"""{}""" # noqa: D415
|
483
|
-
return self._slots
|
484
|
-
|
485
|
-
@doc_args(VALIDATE_DOCSTRING)
|
486
|
-
def validate(self) -> None:
|
487
|
-
"""{}""" # noqa: D415
|
488
|
-
for _, curator in self._slots.items():
|
489
|
-
curator.validate()
|
490
|
-
|
491
551
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
492
552
|
def save_artifact(
|
493
553
|
self,
|
@@ -496,7 +556,7 @@ class AnnDataCurator(Curator):
|
|
496
556
|
description: str | None = None,
|
497
557
|
revises: Artifact | None = None,
|
498
558
|
run: Run | None = None,
|
499
|
-
):
|
559
|
+
) -> Artifact:
|
500
560
|
"""{}""" # noqa: D415
|
501
561
|
if not self._is_validated:
|
502
562
|
self.validate()
|
@@ -504,7 +564,7 @@ class AnnDataCurator(Curator):
|
|
504
564
|
self._dataset,
|
505
565
|
description=description,
|
506
566
|
fields=self.slots["obs"]._cat_manager.categoricals,
|
507
|
-
|
567
|
+
index_field=(
|
508
568
|
parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
|
509
569
|
"field"
|
510
570
|
]
|
@@ -519,34 +579,286 @@ class AnnDataCurator(Curator):
|
|
519
579
|
)
|
520
580
|
|
521
581
|
|
522
|
-
|
523
|
-
|
582
|
+
def _assign_var_fields_categoricals_multimodal(
|
583
|
+
modality: str | None,
|
584
|
+
slot_type: str,
|
585
|
+
slot: str,
|
586
|
+
slot_schema: Schema,
|
587
|
+
var_fields: dict[str, FieldAttr],
|
588
|
+
categoricals: dict[str, dict[str, FieldAttr]],
|
589
|
+
slots: dict[str, DataFrameCurator],
|
590
|
+
) -> None:
|
591
|
+
"""Assigns var_fields and categoricals for multimodal data curators."""
|
592
|
+
if modality is not None:
|
593
|
+
# Makes sure that all tables are present
|
594
|
+
var_fields[modality] = None
|
595
|
+
categoricals[modality] = {}
|
596
|
+
|
597
|
+
if slot_type == "var":
|
598
|
+
var_field = parse_dtype_single_cat(slot_schema.itype, is_itype=True)["field"]
|
599
|
+
if modality is None:
|
600
|
+
# This should rarely/never be used since tables should have different var fields
|
601
|
+
var_fields[slot] = var_field # pragma: no cover
|
602
|
+
else:
|
603
|
+
# Note that this is NOT nested since the nested key is always "var"
|
604
|
+
var_fields[modality] = var_field
|
605
|
+
else:
|
606
|
+
obs_fields = slots[slot]._cat_manager.categoricals
|
607
|
+
if modality is None:
|
608
|
+
categoricals[slot] = obs_fields
|
609
|
+
else:
|
610
|
+
# Note that this is NOT nested since the nested key is always "obs"
|
611
|
+
categoricals[modality] = obs_fields
|
612
|
+
|
524
613
|
|
525
|
-
|
614
|
+
class MuDataCurator(SlotsCurator):
|
615
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
616
|
+
"""Curator for `MuData`.
|
617
|
+
|
618
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
526
619
|
|
527
|
-
|
620
|
+
.. versionadded:: 1.3.0
|
528
621
|
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
>>> columns=Feature.name, # map column names
|
533
|
-
>>> categoricals={"perturbation": ULabel.name}, # map categories
|
534
|
-
>>> )
|
535
|
-
>>> cat_manager.validate() # validate the dataframe
|
536
|
-
>>> artifact = cat_manager.save_artifact(description="my RNA-seq")
|
537
|
-
>>> artifact.describe() # see annotations
|
622
|
+
Args:
|
623
|
+
dataset: The MuData-like object to validate & annotate.
|
624
|
+
schema: A `Schema` object that defines the validation constraints.
|
538
625
|
|
539
|
-
|
626
|
+
Example::
|
540
627
|
|
541
|
-
|
628
|
+
import lamindb as ln
|
629
|
+
import bionty as bt
|
630
|
+
|
631
|
+
# define the global obs schema
|
632
|
+
obs_schema = ln.Schema(
|
633
|
+
name="mudata_papalexi21_subset_obs_schema",
|
634
|
+
features=[
|
635
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
636
|
+
ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
|
637
|
+
],
|
638
|
+
).save()
|
639
|
+
|
640
|
+
# define the ['rna'].obs schema
|
641
|
+
obs_schema_rna = ln.Schema(
|
642
|
+
name="mudata_papalexi21_subset_rna_obs_schema",
|
643
|
+
features=[
|
644
|
+
ln.Feature(name="nCount_RNA", dtype=int).save(),
|
645
|
+
ln.Feature(name="nFeature_RNA", dtype=int).save(),
|
646
|
+
ln.Feature(name="percent.mito", dtype=float).save(),
|
647
|
+
],
|
648
|
+
coerce_dtype=True,
|
649
|
+
).save()
|
650
|
+
|
651
|
+
# define the ['hto'].obs schema
|
652
|
+
obs_schema_hto = ln.Schema(
|
653
|
+
name="mudata_papalexi21_subset_hto_obs_schema",
|
654
|
+
features=[
|
655
|
+
ln.Feature(name="nCount_HTO", dtype=int).save(),
|
656
|
+
ln.Feature(name="nFeature_HTO", dtype=int).save(),
|
657
|
+
ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
|
658
|
+
],
|
659
|
+
coerce_dtype=True,
|
660
|
+
).save()
|
661
|
+
|
662
|
+
# define ['rna'].var schema
|
663
|
+
var_schema_rna = ln.Schema(
|
664
|
+
name="mudata_papalexi21_subset_rna_var_schema",
|
665
|
+
itype=bt.Gene.symbol,
|
666
|
+
dtype=float,
|
667
|
+
).save()
|
668
|
+
|
669
|
+
# define composite schema
|
670
|
+
mudata_schema = ln.Schema(
|
671
|
+
name="mudata_papalexi21_subset_mudata_schema",
|
672
|
+
otype="MuData",
|
673
|
+
components={
|
674
|
+
"obs": obs_schema,
|
675
|
+
"rna:obs": obs_schema_rna,
|
676
|
+
"hto:obs": obs_schema_hto,
|
677
|
+
"rna:var": var_schema_rna,
|
678
|
+
},
|
679
|
+
).save()
|
542
680
|
|
543
|
-
|
544
|
-
|
681
|
+
# curate a MuData
|
682
|
+
mdata = ln.core.datasets.mudata_papalexi21_subset()
|
683
|
+
bt.settings.organism = "human" # set the organism
|
684
|
+
curator = ln.curators.MuDataCurator(mdata, mudata_schema)
|
685
|
+
artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
|
686
|
+
assert artifact.schema == mudata_schema
|
545
687
|
"""
|
546
688
|
|
547
689
|
def __init__(
|
548
|
-
self,
|
549
|
-
|
690
|
+
self,
|
691
|
+
dataset: MuData | Artifact,
|
692
|
+
schema: Schema,
|
693
|
+
) -> None:
|
694
|
+
super().__init__(dataset=dataset, schema=schema)
|
695
|
+
if not data_is_mudata(self._dataset):
|
696
|
+
raise InvalidArgument("dataset must be MuData-like.")
|
697
|
+
if schema.otype != "MuData":
|
698
|
+
raise InvalidArgument("Schema otype must be 'MuData'.")
|
699
|
+
|
700
|
+
for slot, slot_schema in schema.slots.items():
|
701
|
+
# Assign to _slots
|
702
|
+
if ":" in slot:
|
703
|
+
modality, modality_slot = slot.split(":")
|
704
|
+
schema_dataset = self._dataset.__getitem__(modality)
|
705
|
+
else:
|
706
|
+
modality, modality_slot = None, slot
|
707
|
+
schema_dataset = self._dataset
|
708
|
+
self._slots[slot] = DataFrameCurator(
|
709
|
+
(
|
710
|
+
getattr(schema_dataset, modality_slot).T
|
711
|
+
if modality_slot == "var"
|
712
|
+
else getattr(schema_dataset, modality_slot)
|
713
|
+
),
|
714
|
+
slot_schema,
|
715
|
+
)
|
716
|
+
_assign_var_fields_categoricals_multimodal(
|
717
|
+
modality=modality,
|
718
|
+
slot_type=modality_slot,
|
719
|
+
slot=slot,
|
720
|
+
slot_schema=slot_schema,
|
721
|
+
var_fields=self._var_fields,
|
722
|
+
categoricals=self._categoricals,
|
723
|
+
slots=self._slots,
|
724
|
+
)
|
725
|
+
|
726
|
+
# for consistency with BaseCatManager
|
727
|
+
self._columns_field = self._var_fields
|
728
|
+
|
729
|
+
|
730
|
+
class SpatialDataCurator(SlotsCurator):
|
731
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
732
|
+
"""Curator for `SpatialData`.
|
733
|
+
|
734
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
735
|
+
|
736
|
+
.. versionadded:: 1.3.0
|
737
|
+
|
738
|
+
Args:
|
739
|
+
dataset: The SpatialData-like object to validate & annotate.
|
740
|
+
schema: A `Schema` object that defines the validation constraints.
|
741
|
+
|
742
|
+
Example::
|
743
|
+
|
744
|
+
import lamindb as ln
|
745
|
+
import bionty as bt
|
746
|
+
|
747
|
+
# define sample schema
|
748
|
+
sample_schema = ln.Schema(
|
749
|
+
name="blobs_sample_level_metadata",
|
750
|
+
features=[
|
751
|
+
ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
|
752
|
+
ln.Feature(name="disease", dtype=bt.Disease).save(),
|
753
|
+
ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
|
754
|
+
],
|
755
|
+
coerce_dtype=True
|
756
|
+
).save()
|
757
|
+
|
758
|
+
# define table obs schema
|
759
|
+
blobs_obs_schema = ln.Schema(
|
760
|
+
name="blobs_obs_level_metadata",
|
761
|
+
features=[
|
762
|
+
ln.Feature(name="sample_region", dtype="str").save(),
|
763
|
+
],
|
764
|
+
coerce_dtype=True
|
765
|
+
).save()
|
766
|
+
|
767
|
+
# define table var schema
|
768
|
+
blobs_var_schema = ln.Schema(
|
769
|
+
name="blobs_var_schema",
|
770
|
+
itype=bt.Gene.ensembl_gene_id,
|
771
|
+
dtype=int
|
772
|
+
).save()
|
773
|
+
|
774
|
+
# define composite schema
|
775
|
+
spatialdata_schema = ln.Schema(
|
776
|
+
name="blobs_spatialdata_schema",
|
777
|
+
otype="SpatialData",
|
778
|
+
components={
|
779
|
+
"sample": sample_schema,
|
780
|
+
"table:obs": blobs_obs_schema,
|
781
|
+
"table:var": blobs_var_schema,
|
782
|
+
}).save()
|
783
|
+
|
784
|
+
# curate a SpatialData
|
785
|
+
spatialdata = ln.core.datasets.spatialdata_blobs()
|
786
|
+
curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
|
787
|
+
try:
|
788
|
+
curator.validate()
|
789
|
+
except ln.errors.ValidationError as error:
|
790
|
+
print(error)
|
791
|
+
|
792
|
+
# validate again (must pass now) and save artifact
|
793
|
+
artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
|
794
|
+
assert artifact.schema == spatialdata_schema
|
795
|
+
"""
|
796
|
+
|
797
|
+
def __init__(
|
798
|
+
self,
|
799
|
+
dataset: SpatialData | Artifact,
|
800
|
+
schema: Schema,
|
801
|
+
*,
|
802
|
+
sample_metadata_key: str | None = "sample",
|
803
|
+
) -> None:
|
804
|
+
super().__init__(dataset=dataset, schema=schema)
|
805
|
+
if not data_is_spatialdata(self._dataset):
|
806
|
+
raise InvalidArgument("dataset must be SpatialData-like.")
|
807
|
+
if schema.otype != "SpatialData":
|
808
|
+
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
809
|
+
|
810
|
+
for slot, slot_schema in schema.slots.items():
|
811
|
+
# Assign to _slots
|
812
|
+
if ":" in slot:
|
813
|
+
table_key, table_slot = slot.split(":")
|
814
|
+
schema_dataset = self._dataset.tables.__getitem__(table_key)
|
815
|
+
# sample metadata (does not have a `:` separator)
|
816
|
+
else:
|
817
|
+
table_key = None
|
818
|
+
table_slot = slot
|
819
|
+
schema_dataset = self._dataset.get_attrs(
|
820
|
+
key=sample_metadata_key, return_as="df", flatten=True
|
821
|
+
)
|
822
|
+
|
823
|
+
self._slots[slot] = DataFrameCurator(
|
824
|
+
(
|
825
|
+
getattr(schema_dataset, table_slot).T
|
826
|
+
if table_slot == "var"
|
827
|
+
else (
|
828
|
+
getattr(schema_dataset, table_slot)
|
829
|
+
if table_slot != sample_metadata_key
|
830
|
+
else schema_dataset
|
831
|
+
) # just take the schema_dataset if it's the sample metadata key
|
832
|
+
),
|
833
|
+
slot_schema,
|
834
|
+
)
|
835
|
+
|
836
|
+
_assign_var_fields_categoricals_multimodal(
|
837
|
+
modality=table_key,
|
838
|
+
slot_type=table_slot,
|
839
|
+
slot=slot,
|
840
|
+
slot_schema=slot_schema,
|
841
|
+
var_fields=self._var_fields,
|
842
|
+
categoricals=self._categoricals,
|
843
|
+
slots=self._slots,
|
844
|
+
)
|
845
|
+
|
846
|
+
# for consistency with BaseCatManager
|
847
|
+
self._columns_field = self._var_fields
|
848
|
+
|
849
|
+
|
850
|
+
class CatManager:
|
851
|
+
"""Manage categoricals by updating registries.
|
852
|
+
|
853
|
+
This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
|
854
|
+
|
855
|
+
If you find non-validated values, you have several options:
|
856
|
+
|
857
|
+
- new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
|
858
|
+
- non-validated values can be accessed via `DataFrameCurator.cat.non_validated` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
|
859
|
+
"""
|
860
|
+
|
861
|
+
def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
|
550
862
|
# the below is shared with Curator
|
551
863
|
self._artifact: Artifact = None # pass the dataset as an artifact
|
552
864
|
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
@@ -560,7 +872,6 @@ class CatManager:
|
|
560
872
|
self._non_validated = None
|
561
873
|
self._organism = organism
|
562
874
|
self._sources = sources or {}
|
563
|
-
self._exclude = exclude or {}
|
564
875
|
self._columns_field = columns_field
|
565
876
|
self._validate_category_error_messages: str = ""
|
566
877
|
|
@@ -620,7 +931,7 @@ class CatManager:
|
|
620
931
|
Returns:
|
621
932
|
None
|
622
933
|
"""
|
623
|
-
pass #
|
934
|
+
pass # pragma: no cover
|
624
935
|
|
625
936
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
626
937
|
def save_artifact(
|
@@ -645,10 +956,10 @@ class CatManager:
|
|
645
956
|
settings.verbosity = "warning"
|
646
957
|
self._artifact = save_artifact( # type: ignore
|
647
958
|
self._dataset,
|
959
|
+
key=key,
|
648
960
|
description=description,
|
649
961
|
fields=self.categoricals,
|
650
|
-
|
651
|
-
key=key,
|
962
|
+
index_field=self._columns_field,
|
652
963
|
artifact=self._artifact,
|
653
964
|
revises=revises,
|
654
965
|
run=run,
|
@@ -662,34 +973,7 @@ class CatManager:
|
|
662
973
|
|
663
974
|
|
664
975
|
class DataFrameCatManager(CatManager):
|
665
|
-
"""
|
666
|
-
|
667
|
-
See also :class:`~lamindb.Curator`.
|
668
|
-
|
669
|
-
Args:
|
670
|
-
df: The DataFrame object to curate.
|
671
|
-
columns: The field attribute for the feature column.
|
672
|
-
categoricals: A dictionary mapping column names to registry_field.
|
673
|
-
verbosity: The verbosity level.
|
674
|
-
organism: The organism name.
|
675
|
-
sources: A dictionary mapping column names to Source records.
|
676
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
677
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
678
|
-
using the exclude parameter ensures they are not validated.
|
679
|
-
|
680
|
-
Returns:
|
681
|
-
A curator object.
|
682
|
-
|
683
|
-
Examples:
|
684
|
-
>>> import bionty as bt
|
685
|
-
>>> curator = ln.Curator.from_df(
|
686
|
-
... df,
|
687
|
-
... categoricals={
|
688
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
689
|
-
... "donor_id": ULabel.name
|
690
|
-
... }
|
691
|
-
... )
|
692
|
-
"""
|
976
|
+
"""Categorical manager for `DataFrame`."""
|
693
977
|
|
694
978
|
def __init__(
|
695
979
|
self,
|
@@ -699,7 +983,6 @@ class DataFrameCatManager(CatManager):
|
|
699
983
|
verbosity: str = "hint",
|
700
984
|
organism: str | None = None,
|
701
985
|
sources: dict[str, Record] | None = None,
|
702
|
-
exclude: dict | None = None,
|
703
986
|
) -> None:
|
704
987
|
from lamindb.core._settings import settings
|
705
988
|
|
@@ -714,17 +997,16 @@ class DataFrameCatManager(CatManager):
|
|
714
997
|
organism=organism,
|
715
998
|
categoricals=categoricals,
|
716
999
|
sources=sources,
|
717
|
-
exclude=exclude,
|
718
1000
|
)
|
719
1001
|
self._save_columns()
|
720
1002
|
|
721
|
-
def lookup(self, public: bool = False) ->
|
1003
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
722
1004
|
"""Lookup categories.
|
723
1005
|
|
724
1006
|
Args:
|
725
1007
|
public: If `True`, the lookup is performed on the public reference.
|
726
1008
|
"""
|
727
|
-
return
|
1009
|
+
return CatLookup(
|
728
1010
|
categoricals=self._categoricals,
|
729
1011
|
slots={"columns": self._columns_field},
|
730
1012
|
public=public,
|
@@ -739,7 +1021,6 @@ class DataFrameCatManager(CatManager):
|
|
739
1021
|
key="columns",
|
740
1022
|
validated_only=False,
|
741
1023
|
source=self._sources.get("columns"),
|
742
|
-
exclude=self._exclude.get("columns"),
|
743
1024
|
)
|
744
1025
|
|
745
1026
|
# Save the rest of the columns based on validated_only
|
@@ -752,7 +1033,6 @@ class DataFrameCatManager(CatManager):
|
|
752
1033
|
validated_only=validated_only,
|
753
1034
|
df=self._dataset, # Get the Feature type from df
|
754
1035
|
source=self._sources.get("columns"),
|
755
|
-
exclude=self._exclude.get("columns"),
|
756
1036
|
)
|
757
1037
|
|
758
1038
|
@deprecated(new_name="is run by default")
|
@@ -778,7 +1058,6 @@ class DataFrameCatManager(CatManager):
|
|
778
1058
|
self._dataset,
|
779
1059
|
fields=self.categoricals,
|
780
1060
|
sources=self._sources,
|
781
|
-
exclude=self._exclude,
|
782
1061
|
curator=self,
|
783
1062
|
organism=self._organism,
|
784
1063
|
)
|
@@ -852,7 +1131,6 @@ class DataFrameCatManager(CatManager):
|
|
852
1131
|
key=categorical,
|
853
1132
|
validated_only=validated_only,
|
854
1133
|
source=self._sources.get(categorical),
|
855
|
-
exclude=self._exclude.get(categorical),
|
856
1134
|
organism=self._organism,
|
857
1135
|
)
|
858
1136
|
# adding new records removes them from non_validated
|
@@ -882,32 +1160,7 @@ class DataFrameCatManager(CatManager):
|
|
882
1160
|
|
883
1161
|
|
884
1162
|
class AnnDataCatManager(CatManager):
|
885
|
-
"""
|
886
|
-
|
887
|
-
Args:
|
888
|
-
data: The AnnData object or an AnnData-like path.
|
889
|
-
var_index: The registry field for mapping the ``.var`` index.
|
890
|
-
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
891
|
-
obs_columns: The registry field for mapping the ``.obs.columns``.
|
892
|
-
verbosity: The verbosity level.
|
893
|
-
organism: The organism name.
|
894
|
-
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
895
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
896
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
897
|
-
using the exclude parameter ensures they are not validated.
|
898
|
-
|
899
|
-
Examples:
|
900
|
-
>>> import bionty as bt
|
901
|
-
>>> curator = ln.Curator.from_anndata(
|
902
|
-
... adata,
|
903
|
-
... var_index=bt.Gene.ensembl_gene_id,
|
904
|
-
... categoricals={
|
905
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
906
|
-
... "donor_id": ULabel.name
|
907
|
-
... },
|
908
|
-
... organism="human",
|
909
|
-
... )
|
910
|
-
"""
|
1163
|
+
"""Categorical manager for `AnnData`."""
|
911
1164
|
|
912
1165
|
def __init__(
|
913
1166
|
self,
|
@@ -918,13 +1171,10 @@ class AnnDataCatManager(CatManager):
|
|
918
1171
|
verbosity: str = "hint",
|
919
1172
|
organism: str | None = None,
|
920
1173
|
sources: dict[str, Record] | None = None,
|
921
|
-
exclude: dict | None = None,
|
922
1174
|
) -> None:
|
923
1175
|
if isinstance(var_index, str):
|
924
1176
|
raise TypeError("var_index parameter has to be a bionty field")
|
925
1177
|
|
926
|
-
if sources is None:
|
927
|
-
sources = {}
|
928
1178
|
if not data_is_anndata(data):
|
929
1179
|
raise TypeError("data has to be an AnnData object")
|
930
1180
|
|
@@ -935,12 +1185,12 @@ class AnnDataCatManager(CatManager):
|
|
935
1185
|
|
936
1186
|
self._obs_fields = categoricals or {}
|
937
1187
|
self._var_field = var_index
|
1188
|
+
self._sources = sources or {}
|
938
1189
|
super().__init__(
|
939
1190
|
dataset=data,
|
940
1191
|
categoricals=categoricals,
|
941
|
-
sources=
|
1192
|
+
sources=self._sources,
|
942
1193
|
organism=organism,
|
943
|
-
exclude=exclude,
|
944
1194
|
columns_field=var_index,
|
945
1195
|
)
|
946
1196
|
self._adata = self._dataset
|
@@ -950,8 +1200,7 @@ class AnnDataCatManager(CatManager):
|
|
950
1200
|
columns=obs_columns,
|
951
1201
|
verbosity=verbosity,
|
952
1202
|
organism=None,
|
953
|
-
sources=
|
954
|
-
exclude=exclude,
|
1203
|
+
sources=self._sources,
|
955
1204
|
)
|
956
1205
|
|
957
1206
|
@property
|
@@ -964,13 +1213,13 @@ class AnnDataCatManager(CatManager):
|
|
964
1213
|
"""Return the obs fields to validate against."""
|
965
1214
|
return self._obs_fields
|
966
1215
|
|
967
|
-
def lookup(self, public: bool = False) ->
|
1216
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
968
1217
|
"""Lookup categories.
|
969
1218
|
|
970
1219
|
Args:
|
971
1220
|
public: If "public", the lookup is performed on the public reference.
|
972
1221
|
"""
|
973
|
-
return
|
1222
|
+
return CatLookup(
|
974
1223
|
categoricals=self._obs_fields,
|
975
1224
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
976
1225
|
public=public,
|
@@ -989,7 +1238,6 @@ class AnnDataCatManager(CatManager):
|
|
989
1238
|
validated_only=validated_only,
|
990
1239
|
organism=self._organism,
|
991
1240
|
source=self._sources.get("var_index"),
|
992
|
-
exclude=self._exclude.get("var_index"),
|
993
1241
|
)
|
994
1242
|
|
995
1243
|
def add_new_from(self, key: str, **kwargs):
|
@@ -1033,7 +1281,6 @@ class AnnDataCatManager(CatManager):
|
|
1033
1281
|
key="var_index",
|
1034
1282
|
source=self._sources.get("var_index"),
|
1035
1283
|
hint_print=".add_new_from_var_index()",
|
1036
|
-
exclude=self._exclude.get("var_index"),
|
1037
1284
|
organism=self._organism, # type: ignore
|
1038
1285
|
)
|
1039
1286
|
else:
|
@@ -1077,59 +1324,29 @@ class AnnDataCatManager(CatManager):
|
|
1077
1324
|
|
1078
1325
|
|
1079
1326
|
class MuDataCatManager(CatManager):
|
1080
|
-
"""
|
1081
|
-
|
1082
|
-
Args:
|
1083
|
-
mdata: The MuData object to curate.
|
1084
|
-
var_index: The registry field for mapping the ``.var`` index for each modality.
|
1085
|
-
For example:
|
1086
|
-
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
|
1087
|
-
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
1088
|
-
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
1089
|
-
verbosity: The verbosity level.
|
1090
|
-
organism: The organism name.
|
1091
|
-
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
1092
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
1093
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1094
|
-
using the exclude parameter ensures they are not validated.
|
1095
|
-
|
1096
|
-
Examples:
|
1097
|
-
>>> import bionty as bt
|
1098
|
-
>>> curator = ln.Curator.from_mudata(
|
1099
|
-
... mdata,
|
1100
|
-
... var_index={
|
1101
|
-
... "rna": bt.Gene.ensembl_gene_id,
|
1102
|
-
... "adt": CellMarker.name
|
1103
|
-
... },
|
1104
|
-
... categoricals={
|
1105
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1106
|
-
... "donor_id": ULabel.name
|
1107
|
-
... },
|
1108
|
-
... organism="human",
|
1109
|
-
... )
|
1110
|
-
"""
|
1327
|
+
"""Categorical manager for `MuData`."""
|
1111
1328
|
|
1112
1329
|
def __init__(
|
1113
1330
|
self,
|
1114
1331
|
mdata: MuData | Artifact,
|
1115
|
-
var_index: dict[str, FieldAttr],
|
1332
|
+
var_index: dict[str, FieldAttr] | None = None,
|
1116
1333
|
categoricals: dict[str, FieldAttr] | None = None,
|
1117
1334
|
verbosity: str = "hint",
|
1118
1335
|
organism: str | None = None,
|
1119
1336
|
sources: dict[str, Record] | None = None,
|
1120
|
-
exclude: dict | None = None, # {modality: {field: [values]}}
|
1121
1337
|
) -> None:
|
1122
1338
|
super().__init__(
|
1123
1339
|
dataset=mdata,
|
1124
1340
|
categoricals={},
|
1125
1341
|
sources=sources,
|
1126
1342
|
organism=organism,
|
1127
|
-
exclude=exclude,
|
1128
1343
|
)
|
1129
|
-
self._columns_field =
|
1130
|
-
|
1344
|
+
self._columns_field = (
|
1345
|
+
var_index or {}
|
1346
|
+
) # this is for consistency with BaseCatManager
|
1347
|
+
self._var_fields = var_index or {}
|
1131
1348
|
self._verify_modality(self._var_fields.keys())
|
1132
|
-
self._obs_fields = self._parse_categoricals(categoricals)
|
1349
|
+
self._obs_fields = self._parse_categoricals(categoricals or {})
|
1133
1350
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
1134
1351
|
self._verbosity = verbosity
|
1135
1352
|
self._obs_df_curator = None
|
@@ -1140,7 +1357,6 @@ class MuDataCatManager(CatManager):
|
|
1140
1357
|
categoricals=self._obs_fields.get("obs", {}),
|
1141
1358
|
verbosity=verbosity,
|
1142
1359
|
sources=self._sources.get("obs"),
|
1143
|
-
exclude=self._exclude.get("obs"),
|
1144
1360
|
organism=organism,
|
1145
1361
|
)
|
1146
1362
|
self._mod_adata_curators = {
|
@@ -1150,7 +1366,6 @@ class MuDataCatManager(CatManager):
|
|
1150
1366
|
categoricals=self._obs_fields.get(modality),
|
1151
1367
|
verbosity=verbosity,
|
1152
1368
|
sources=self._sources.get(modality),
|
1153
|
-
exclude=self._exclude.get(modality),
|
1154
1369
|
organism=organism,
|
1155
1370
|
)
|
1156
1371
|
for modality in self._modalities
|
@@ -1199,7 +1414,7 @@ class MuDataCatManager(CatManager):
|
|
1199
1414
|
obs_fields["obs"][k] = v
|
1200
1415
|
return obs_fields
|
1201
1416
|
|
1202
|
-
def lookup(self, public: bool = False) ->
|
1417
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
1203
1418
|
"""Lookup categories.
|
1204
1419
|
|
1205
1420
|
Args:
|
@@ -1212,7 +1427,7 @@ class MuDataCatManager(CatManager):
|
|
1212
1427
|
obs_fields[k] = v
|
1213
1428
|
else:
|
1214
1429
|
obs_fields[f"{mod}:{k}"] = v
|
1215
|
-
return
|
1430
|
+
return CatLookup(
|
1216
1431
|
categoricals=obs_fields,
|
1217
1432
|
slots={
|
1218
1433
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
@@ -1271,8 +1486,6 @@ class MuDataCatManager(CatManager):
|
|
1271
1486
|
|
1272
1487
|
def validate(self) -> bool:
|
1273
1488
|
"""Validate categories."""
|
1274
|
-
from lamindb.core._settings import settings
|
1275
|
-
|
1276
1489
|
# add all validated records to the current instance
|
1277
1490
|
verbosity = settings.verbosity
|
1278
1491
|
try:
|
@@ -1329,393 +1542,290 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1329
1542
|
)
|
1330
1543
|
|
1331
1544
|
|
1332
|
-
class
|
1333
|
-
"""
|
1334
|
-
|
1335
|
-
Args:
|
1336
|
-
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
1337
|
-
var_index: The registry fields for mapping the `.var` indices for measurements.
|
1338
|
-
Should be in the form `{"measurement name": ("var column", field)}`.
|
1339
|
-
These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
|
1340
|
-
in `.standardize` or `.add_new_from`, see the output of `.var_index`.
|
1341
|
-
categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
|
1342
|
-
obs_columns: The registry field for mapping the names of the `.obs` columns.
|
1343
|
-
organism: The organism name.
|
1344
|
-
sources: A dictionary mapping `.obs` columns to Source records.
|
1345
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
1346
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1347
|
-
using the exclude parameter ensures they are not validated.
|
1348
|
-
|
1349
|
-
Examples:
|
1350
|
-
>>> import bionty as bt
|
1351
|
-
>>> curator = ln.Curator.from_tiledbsoma(
|
1352
|
-
... "./my_array_store.tiledbsoma",
|
1353
|
-
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1354
|
-
... categoricals={
|
1355
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1356
|
-
... "donor_id": ULabel.name
|
1357
|
-
... },
|
1358
|
-
... organism="human",
|
1359
|
-
... )
|
1360
|
-
"""
|
1545
|
+
class SpatialDataCatManager(CatManager):
|
1546
|
+
"""Categorical manager for `SpatialData`."""
|
1361
1547
|
|
1362
1548
|
def __init__(
|
1363
1549
|
self,
|
1364
|
-
|
1365
|
-
var_index: dict[str,
|
1366
|
-
categoricals: dict[str, FieldAttr] | None = None,
|
1367
|
-
|
1550
|
+
sdata: Any,
|
1551
|
+
var_index: dict[str, FieldAttr],
|
1552
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
1553
|
+
verbosity: str = "hint",
|
1368
1554
|
organism: str | None = None,
|
1369
|
-
sources: dict[str, Record] | None = None,
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1555
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
1556
|
+
*,
|
1557
|
+
sample_metadata_key: str | None = "sample",
|
1558
|
+
) -> None:
|
1559
|
+
super().__init__(
|
1560
|
+
dataset=sdata,
|
1561
|
+
categoricals={},
|
1562
|
+
sources=sources,
|
1563
|
+
organism=organism,
|
1564
|
+
)
|
1565
|
+
if isinstance(sdata, Artifact):
|
1566
|
+
self._sdata = sdata.load()
|
1378
1567
|
else:
|
1379
|
-
self.
|
1380
|
-
|
1381
|
-
self.
|
1382
|
-
self.
|
1383
|
-
self.
|
1384
|
-
|
1385
|
-
self.
|
1386
|
-
|
1387
|
-
self._validated_values: dict[str, list] = {}
|
1388
|
-
# filled by _check_save_keys
|
1389
|
-
self._n_obs: int | None = None
|
1390
|
-
self._valid_obs_keys: list[str] | None = None
|
1391
|
-
self._obs_pa_schema: pa.lib.Schema | None = (
|
1392
|
-
None # this is needed to create the obs feature set
|
1568
|
+
self._sdata = self._dataset
|
1569
|
+
self._sample_metadata_key = sample_metadata_key
|
1570
|
+
self._write_path = None
|
1571
|
+
self._var_fields = var_index
|
1572
|
+
self._verify_accessor_exists(self._var_fields.keys())
|
1573
|
+
self._categoricals = categoricals
|
1574
|
+
self._table_keys = set(self._var_fields.keys()) | set(
|
1575
|
+
self._categoricals.keys() - {self._sample_metadata_key}
|
1393
1576
|
)
|
1394
|
-
self.
|
1395
|
-
self.
|
1396
|
-
self.
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1402
|
-
|
1403
|
-
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1404
|
-
experiment_obs = experiment.obs
|
1405
|
-
self._n_obs = len(experiment_obs)
|
1406
|
-
self._obs_pa_schema = experiment_obs.schema
|
1407
|
-
valid_obs_keys = [
|
1408
|
-
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1409
|
-
]
|
1410
|
-
self._valid_obs_keys = valid_obs_keys
|
1411
|
-
|
1412
|
-
valid_var_keys = []
|
1413
|
-
ms_list = []
|
1414
|
-
for ms in experiment.ms.keys():
|
1415
|
-
ms_list.append(ms)
|
1416
|
-
var_ms = experiment.ms[ms].var
|
1417
|
-
valid_var_keys += [
|
1418
|
-
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1419
|
-
]
|
1420
|
-
self._valid_var_keys = valid_var_keys
|
1577
|
+
self._verbosity = verbosity
|
1578
|
+
self._sample_df_curator = None
|
1579
|
+
if self._sample_metadata_key is not None:
|
1580
|
+
self._sample_metadata = self._sdata.get_attrs(
|
1581
|
+
key=self._sample_metadata_key, return_as="df", flatten=True
|
1582
|
+
)
|
1583
|
+
self._is_validated = False
|
1421
1584
|
|
1422
|
-
#
|
1585
|
+
# Check validity of keys in categoricals
|
1423
1586
|
nonval_keys = []
|
1424
|
-
for
|
1425
|
-
if
|
1426
|
-
|
1587
|
+
for accessor, accessor_categoricals in self._categoricals.items():
|
1588
|
+
if (
|
1589
|
+
accessor == self._sample_metadata_key
|
1590
|
+
and self._sample_metadata is not None
|
1591
|
+
):
|
1592
|
+
for key in accessor_categoricals.keys():
|
1593
|
+
if key not in self._sample_metadata.columns:
|
1594
|
+
nonval_keys.append(key)
|
1595
|
+
else:
|
1596
|
+
for key in accessor_categoricals.keys():
|
1597
|
+
if key not in self._sdata[accessor].obs.columns:
|
1598
|
+
nonval_keys.append(key)
|
1599
|
+
|
1427
1600
|
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1428
1601
|
|
1429
|
-
# check validity of keys in
|
1430
|
-
self._var_fields_flat = {}
|
1602
|
+
# check validity of keys in sources
|
1431
1603
|
nonval_keys = []
|
1432
|
-
for
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1604
|
+
for accessor, accessor_sources in self._sources.items():
|
1605
|
+
if (
|
1606
|
+
accessor == self._sample_metadata_key
|
1607
|
+
and self._sample_metadata is not None
|
1608
|
+
):
|
1609
|
+
columns = self._sample_metadata.columns
|
1610
|
+
elif accessor != self._sample_metadata_key:
|
1611
|
+
columns = self._sdata[accessor].obs.columns
|
1437
1612
|
else:
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1444
|
-
nonval_keys = []
|
1445
|
-
for arg_key in dct.keys():
|
1446
|
-
if arg_key not in valid_arg_keys:
|
1447
|
-
nonval_keys.append(arg_key)
|
1448
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1613
|
+
continue
|
1614
|
+
for key in accessor_sources:
|
1615
|
+
if key not in columns:
|
1616
|
+
nonval_keys.append(key)
|
1617
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1449
1618
|
|
1450
|
-
#
|
1451
|
-
register_columns = list(self._obs_fields.keys())
|
1452
|
-
organism = check_registry_organism(
|
1453
|
-
self._columns_field.field.model, self._organism
|
1454
|
-
).get("organism")
|
1455
|
-
update_registry(
|
1456
|
-
values=register_columns,
|
1457
|
-
field=self._columns_field,
|
1458
|
-
key="columns",
|
1459
|
-
validated_only=False,
|
1460
|
-
organism=organism,
|
1461
|
-
source=self._sources.get("columns"),
|
1462
|
-
exclude=self._exclude.get("columns"),
|
1463
|
-
)
|
1464
|
-
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1465
|
-
# no need to register with validated_only=True if columns are features
|
1619
|
+
# Set up sample level metadata and table Curator objects
|
1466
1620
|
if (
|
1467
|
-
|
1468
|
-
and self.
|
1621
|
+
self._sample_metadata_key is not None
|
1622
|
+
and self._sample_metadata_key in self._categoricals
|
1469
1623
|
):
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1474
|
-
|
1624
|
+
self._sample_df_curator = DataFrameCatManager(
|
1625
|
+
df=self._sample_metadata,
|
1626
|
+
columns=Feature.name,
|
1627
|
+
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1628
|
+
verbosity=verbosity,
|
1629
|
+
sources=self._sources.get(self._sample_metadata_key),
|
1630
|
+
organism=organism,
|
1631
|
+
)
|
1632
|
+
self._table_adata_curators = {
|
1633
|
+
table: AnnDataCatManager(
|
1634
|
+
data=self._sdata[table],
|
1635
|
+
var_index=var_index.get(table),
|
1636
|
+
categoricals=self._categoricals.get(table),
|
1637
|
+
verbosity=verbosity,
|
1638
|
+
sources=self._sources.get(table),
|
1475
1639
|
organism=organism,
|
1476
|
-
source=self._sources.get("columns"),
|
1477
|
-
exclude=self._exclude.get("columns"),
|
1478
1640
|
)
|
1641
|
+
for table in self._table_keys
|
1642
|
+
}
|
1479
1643
|
|
1480
|
-
|
1481
|
-
"""Validate categories."""
|
1482
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1644
|
+
self._non_validated = None
|
1483
1645
|
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
var_ms = experiment.ms[ms].var
|
1489
|
-
var_ms_key = f"{ms}__{key}"
|
1490
|
-
# it was already validated and cached
|
1491
|
-
if var_ms_key in self._validated_values:
|
1492
|
-
continue
|
1493
|
-
var_ms_values = (
|
1494
|
-
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
1495
|
-
)
|
1496
|
-
organism = check_registry_organism(
|
1497
|
-
field.field.model, self._organism
|
1498
|
-
).get("organism")
|
1499
|
-
update_registry(
|
1500
|
-
values=var_ms_values,
|
1501
|
-
field=field,
|
1502
|
-
key=var_ms_key,
|
1503
|
-
validated_only=True,
|
1504
|
-
organism=organism,
|
1505
|
-
source=self._sources.get(var_ms_key),
|
1506
|
-
exclude=self._exclude.get(var_ms_key),
|
1507
|
-
)
|
1508
|
-
_, non_val = validate_categories(
|
1509
|
-
values=var_ms_values,
|
1510
|
-
field=field,
|
1511
|
-
key=var_ms_key,
|
1512
|
-
organism=organism,
|
1513
|
-
source=self._sources.get(var_ms_key),
|
1514
|
-
exclude=self._exclude.get(var_ms_key),
|
1515
|
-
)
|
1516
|
-
if len(non_val) > 0:
|
1517
|
-
validated = False
|
1518
|
-
self._non_validated_values[var_ms_key] = non_val
|
1519
|
-
else:
|
1520
|
-
self._validated_values[var_ms_key] = var_ms_values
|
1646
|
+
@property
|
1647
|
+
def var_index(self) -> FieldAttr:
|
1648
|
+
"""Return the registry fields to validate variables indices against."""
|
1649
|
+
return self._var_fields
|
1521
1650
|
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
continue
|
1527
|
-
values = pa.compute.unique(
|
1528
|
-
obs.read(column_names=[key]).concat()[key]
|
1529
|
-
).to_pylist()
|
1530
|
-
organism = check_registry_organism(
|
1531
|
-
field.field.model, self._organism
|
1532
|
-
).get("organism")
|
1533
|
-
update_registry(
|
1534
|
-
values=values,
|
1535
|
-
field=field,
|
1536
|
-
key=key,
|
1537
|
-
validated_only=True,
|
1538
|
-
organism=organism,
|
1539
|
-
source=self._sources.get(key),
|
1540
|
-
exclude=self._exclude.get(key),
|
1541
|
-
)
|
1542
|
-
_, non_val = validate_categories(
|
1543
|
-
values=values,
|
1544
|
-
field=field,
|
1545
|
-
key=key,
|
1546
|
-
organism=organism,
|
1547
|
-
source=self._sources.get(key),
|
1548
|
-
exclude=self._exclude.get(key),
|
1549
|
-
)
|
1550
|
-
if len(non_val) > 0:
|
1551
|
-
validated = False
|
1552
|
-
self._non_validated_values[key] = non_val
|
1553
|
-
else:
|
1554
|
-
self._validated_values[key] = values
|
1555
|
-
self._is_validated = validated
|
1556
|
-
return self._is_validated
|
1651
|
+
@property
|
1652
|
+
def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
|
1653
|
+
"""Return the categorical keys and fields to validate against."""
|
1654
|
+
return self._categoricals
|
1557
1655
|
|
1558
|
-
|
1559
|
-
|
1656
|
+
@property
|
1657
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
1658
|
+
"""Return the non-validated features and labels."""
|
1659
|
+
if self._non_validated is None:
|
1660
|
+
raise ValidationError("Please run validate() first!")
|
1661
|
+
return self._non_validated
|
1560
1662
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1663
|
+
def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
|
1664
|
+
"""Verify that the accessors exist (either a valid table or in attrs)."""
|
1665
|
+
for acc in accessors:
|
1666
|
+
is_present = False
|
1667
|
+
try:
|
1668
|
+
self._sdata.get_attrs(key=acc)
|
1669
|
+
is_present = True
|
1670
|
+
except KeyError:
|
1671
|
+
if acc in self._sdata.tables.keys():
|
1672
|
+
is_present = True
|
1673
|
+
if not is_present:
|
1674
|
+
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
1570
1675
|
|
1571
|
-
def
|
1572
|
-
"""
|
1676
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
1677
|
+
"""Look up categories.
|
1573
1678
|
|
1574
1679
|
Args:
|
1575
|
-
|
1576
|
-
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
1577
|
-
or a column name in `.obs`.
|
1680
|
+
public: Whether the lookup is performed on the public reference.
|
1578
1681
|
"""
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1682
|
+
cat_values_dict = list(self.categoricals.values())[0]
|
1683
|
+
return CatLookup(
|
1684
|
+
categoricals=cat_values_dict,
|
1685
|
+
slots={"accessors": cat_values_dict.keys()},
|
1686
|
+
public=public,
|
1687
|
+
)
|
1688
|
+
|
1689
|
+
def _update_registry_all(self) -> None:
|
1690
|
+
"""Saves labels of all features for sample and table metadata."""
|
1691
|
+
if self._sample_df_curator is not None:
|
1692
|
+
self._sample_df_curator._update_registry_all(
|
1693
|
+
validated_only=True,
|
1586
1694
|
)
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
)
|
1591
|
-
keys = [key]
|
1592
|
-
for k in keys:
|
1593
|
-
values, field = self._non_validated_values_field(k)
|
1594
|
-
if len(values) == 0:
|
1595
|
-
continue
|
1596
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1597
|
-
"organism"
|
1598
|
-
)
|
1599
|
-
update_registry(
|
1600
|
-
values=values,
|
1601
|
-
field=field,
|
1602
|
-
key=k,
|
1603
|
-
validated_only=False,
|
1604
|
-
organism=organism,
|
1605
|
-
source=self._sources.get(k),
|
1606
|
-
exclude=self._exclude.get(k),
|
1607
|
-
**kwargs,
|
1695
|
+
for _, adata_curator in self._table_adata_curators.items():
|
1696
|
+
adata_curator._obs_df_curator._update_registry_all(
|
1697
|
+
validated_only=True,
|
1608
1698
|
)
|
1609
|
-
# update non-validated values list but keep the key there
|
1610
|
-
# it will be removed by .validate()
|
1611
|
-
if k in self._non_validated_values:
|
1612
|
-
self._non_validated_values[k] = []
|
1613
1699
|
|
1614
|
-
|
1615
|
-
|
1616
|
-
"""Return the non-validated features and labels."""
|
1617
|
-
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
1618
|
-
return non_val
|
1700
|
+
def add_new_from_var_index(self, table: str, **kwargs) -> None:
|
1701
|
+
"""Save new values from ``.var.index`` of table.
|
1619
1702
|
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1703
|
+
Args:
|
1704
|
+
table: The table key.
|
1705
|
+
organism: The organism name.
|
1706
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1707
|
+
"""
|
1708
|
+
if self._non_validated is None:
|
1709
|
+
raise ValidationError("Run .validate() first.")
|
1710
|
+
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
1711
|
+
if table in self.non_validated.keys():
|
1712
|
+
if "var_index" in self._non_validated[table]:
|
1713
|
+
self._non_validated[table].pop("var_index")
|
1624
1714
|
|
1625
|
-
|
1626
|
-
|
1627
|
-
"""Return the obs fields to validate against."""
|
1628
|
-
return self._obs_fields
|
1715
|
+
if len(self.non_validated[table].values()) == 0:
|
1716
|
+
self.non_validated.pop(table)
|
1629
1717
|
|
1630
|
-
def
|
1631
|
-
|
1718
|
+
def add_new_from(
|
1719
|
+
self,
|
1720
|
+
key: str,
|
1721
|
+
accessor: str | None = None,
|
1722
|
+
**kwargs,
|
1723
|
+
) -> None:
|
1724
|
+
"""Save new values of categorical from sample level metadata or table.
|
1632
1725
|
|
1633
1726
|
Args:
|
1634
|
-
|
1727
|
+
key: The key referencing the slot in the DataFrame.
|
1728
|
+
accessor: The accessor key such as 'sample' or 'table x'.
|
1729
|
+
organism: The organism name.
|
1730
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1635
1731
|
"""
|
1636
|
-
|
1637
|
-
|
1638
|
-
slots={"columns": self._columns_field, **self._var_fields_flat},
|
1639
|
-
public=public,
|
1640
|
-
)
|
1732
|
+
if self._non_validated is None:
|
1733
|
+
raise ValidationError("Run .validate() first.")
|
1641
1734
|
|
1642
|
-
|
1643
|
-
|
1735
|
+
if len(kwargs) > 0 and key == "all":
|
1736
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
1737
|
+
|
1738
|
+
if accessor not in self.categoricals:
|
1739
|
+
raise ValueError(
|
1740
|
+
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
1741
|
+
)
|
1742
|
+
|
1743
|
+
if accessor in self._table_adata_curators:
|
1744
|
+
adata_curator = self._table_adata_curators[accessor]
|
1745
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
1746
|
+
if accessor == self._sample_metadata_key:
|
1747
|
+
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
1748
|
+
|
1749
|
+
if accessor in self.non_validated.keys():
|
1750
|
+
if len(self.non_validated[accessor].values()) == 0:
|
1751
|
+
self.non_validated.pop(accessor)
|
1752
|
+
|
1753
|
+
def standardize(self, key: str, accessor: str | None = None) -> None:
|
1754
|
+
"""Replace synonyms with canonical values.
|
1644
1755
|
|
1645
1756
|
Modifies the dataset inplace.
|
1646
1757
|
|
1647
1758
|
Args:
|
1648
|
-
key: The key referencing the slot in the
|
1649
|
-
|
1650
|
-
or a column name in `.obs`.
|
1759
|
+
key: The key referencing the slot in the table or sample metadata.
|
1760
|
+
accessor: The accessor key such as 'sample_key' or 'table_key'.
|
1651
1761
|
"""
|
1652
1762
|
if len(self.non_validated) == 0:
|
1653
1763
|
logger.warning("values are already standardized")
|
1654
1764
|
return
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1765
|
+
if self._artifact is not None:
|
1766
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
1767
|
+
|
1768
|
+
if accessor == self._sample_metadata_key:
|
1769
|
+
if key not in self._sample_metadata.columns:
|
1770
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
1658
1771
|
else:
|
1659
|
-
if
|
1660
|
-
|
1661
|
-
|
1662
|
-
|
1663
|
-
|
1772
|
+
if (
|
1773
|
+
key == "var_index" and self._sdata.tables[accessor].var.index is None
|
1774
|
+
) or (
|
1775
|
+
key != "var_index"
|
1776
|
+
and key not in self._sdata.tables[accessor].obs.columns
|
1777
|
+
):
|
1778
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
1664
1779
|
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
ms, _, slot_key = k.partition("__")
|
1671
|
-
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
1672
|
-
else:
|
1673
|
-
slot = lambda experiment: experiment.obs
|
1674
|
-
slot_key = k
|
1675
|
-
# errors if public ontology and the model has no organism
|
1676
|
-
# has to be fixed in bionty
|
1677
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1678
|
-
"organism"
|
1679
|
-
)
|
1680
|
-
syn_mapper = standardize_categories(
|
1681
|
-
values=values,
|
1682
|
-
field=field,
|
1683
|
-
source=self._sources.get(k),
|
1684
|
-
organism=organism,
|
1685
|
-
)
|
1686
|
-
if (n_syn_mapper := len(syn_mapper)) == 0:
|
1687
|
-
continue
|
1780
|
+
if accessor in self._table_adata_curators.keys():
|
1781
|
+
adata_curator = self._table_adata_curators[accessor]
|
1782
|
+
adata_curator.standardize(key)
|
1783
|
+
if accessor == self._sample_metadata_key:
|
1784
|
+
self._sample_df_curator.standardize(key)
|
1688
1785
|
|
1689
|
-
|
1786
|
+
if len(self.non_validated[accessor].values()) == 0:
|
1787
|
+
self.non_validated.pop(accessor)
|
1690
1788
|
|
1691
|
-
|
1692
|
-
|
1693
|
-
table = slot(experiment).read(value_filter=value_filter).concat()
|
1789
|
+
def validate(self) -> bool:
|
1790
|
+
"""Validate variables and categorical observations.
|
1694
1791
|
|
1695
|
-
|
1696
|
-
|
1792
|
+
This method also registers the validated records in the current instance:
|
1793
|
+
- from public sources
|
1697
1794
|
|
1698
|
-
|
1699
|
-
|
1700
|
-
df[slot_key] = df[slot_key].map(
|
1701
|
-
lambda val: syn_mapper.get(val, val) # noqa
|
1702
|
-
)
|
1703
|
-
# write the mapped values
|
1704
|
-
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
1705
|
-
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
1706
|
-
# update non_validated dict
|
1707
|
-
non_val_k = [
|
1708
|
-
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
1709
|
-
]
|
1710
|
-
self._non_validated_values[k] = non_val_k
|
1795
|
+
Args:
|
1796
|
+
organism: The organism name.
|
1711
1797
|
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1718
|
-
|
1798
|
+
Returns:
|
1799
|
+
Whether the SpatialData object is validated.
|
1800
|
+
"""
|
1801
|
+
# add all validated records to the current instance
|
1802
|
+
verbosity = settings.verbosity
|
1803
|
+
try:
|
1804
|
+
settings.verbosity = "error"
|
1805
|
+
self._update_registry_all()
|
1806
|
+
finally:
|
1807
|
+
settings.verbosity = verbosity
|
1808
|
+
|
1809
|
+
self._non_validated = {} # type: ignore
|
1810
|
+
|
1811
|
+
sample_validated = True
|
1812
|
+
if self._sample_df_curator:
|
1813
|
+
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
1814
|
+
sample_validated &= self._sample_df_curator.validate()
|
1815
|
+
if len(self._sample_df_curator.non_validated) > 0:
|
1816
|
+
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
1817
|
+
logger.print("")
|
1818
|
+
|
1819
|
+
mods_validated = True
|
1820
|
+
for table, adata_curator in self._table_adata_curators.items():
|
1821
|
+
logger.info(f"validating categoricals of table '{table}' ...")
|
1822
|
+
mods_validated &= adata_curator.validate()
|
1823
|
+
if len(adata_curator.non_validated) > 0:
|
1824
|
+
self._non_validated[table] = adata_curator.non_validated # type: ignore
|
1825
|
+
logger.print("")
|
1826
|
+
|
1827
|
+
self._is_validated = sample_validated & mods_validated
|
1828
|
+
return self._is_validated
|
1719
1829
|
|
1720
1830
|
def save_artifact(
|
1721
1831
|
self,
|
@@ -1725,424 +1835,388 @@ class TiledbsomaCatManager(CatManager):
|
|
1725
1835
|
revises: Artifact | None = None,
|
1726
1836
|
run: Run | None = None,
|
1727
1837
|
) -> Artifact:
|
1728
|
-
"""Save the validated
|
1838
|
+
"""Save the validated SpatialData store and metadata.
|
1729
1839
|
|
1730
1840
|
Args:
|
1731
|
-
description: A description of the
|
1841
|
+
description: A description of the dataset.
|
1732
1842
|
key: A path-like key to reference artifact in default storage,
|
1733
|
-
e.g., `"
|
1843
|
+
e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
|
1734
1844
|
revises: Previous version of the artifact. Triggers a revision.
|
1735
1845
|
run: The run that creates the artifact.
|
1736
1846
|
|
1737
1847
|
Returns:
|
1738
1848
|
A saved artifact record.
|
1739
1849
|
"""
|
1740
|
-
from lamindb.models.artifact import add_labels
|
1741
|
-
|
1742
1850
|
if not self._is_validated:
|
1743
1851
|
self.validate()
|
1744
1852
|
if not self._is_validated:
|
1745
1853
|
raise ValidationError("Dataset does not validate. Please curate.")
|
1746
1854
|
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1855
|
+
return save_artifact(
|
1856
|
+
self._sdata,
|
1857
|
+
description=description,
|
1858
|
+
fields=self.categoricals,
|
1859
|
+
index_field=self.var_index,
|
1860
|
+
key=key,
|
1861
|
+
artifact=self._artifact,
|
1862
|
+
revises=revises,
|
1863
|
+
run=run,
|
1864
|
+
schema=None,
|
1865
|
+
organism=self._organism,
|
1866
|
+
sample_metadata_key=self._sample_metadata_key,
|
1867
|
+
)
|
1868
|
+
|
1869
|
+
|
1870
|
+
class TiledbsomaCatManager(CatManager):
|
1871
|
+
"""Categorical manager for `tiledbsoma.Experiment`."""
|
1872
|
+
|
1873
|
+
def __init__(
|
1874
|
+
self,
|
1875
|
+
experiment_uri: UPathStr | Artifact,
|
1876
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
1877
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
1878
|
+
obs_columns: FieldAttr = Feature.name,
|
1879
|
+
organism: str | None = None,
|
1880
|
+
sources: dict[str, Record] | None = None,
|
1881
|
+
):
|
1882
|
+
self._obs_fields = categoricals or {}
|
1883
|
+
self._var_fields = var_index
|
1884
|
+
self._columns_field = obs_columns
|
1885
|
+
if isinstance(experiment_uri, Artifact):
|
1886
|
+
self._dataset = experiment_uri.path
|
1887
|
+
self._artifact = experiment_uri
|
1758
1888
|
else:
|
1759
|
-
|
1889
|
+
self._dataset = UPath(experiment_uri)
|
1890
|
+
self._artifact = None
|
1891
|
+
self._organism = organism
|
1892
|
+
self._sources = sources or {}
|
1760
1893
|
|
1761
|
-
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
|
1767
|
-
|
1768
|
-
|
1769
|
-
|
1770
|
-
|
1771
|
-
|
1772
|
-
|
1773
|
-
field=self._columns_field,
|
1774
|
-
mute=True,
|
1775
|
-
organism=organism,
|
1776
|
-
)
|
1777
|
-
for ms in self._var_fields:
|
1778
|
-
var_key, var_field = self._var_fields[ms]
|
1779
|
-
organism = check_registry_organism(
|
1780
|
-
var_field.field.model, self._organism
|
1781
|
-
).get("organism")
|
1782
|
-
feature_sets[f"{ms}__var"] = Schema.from_values(
|
1783
|
-
values=self._validated_values[f"{ms}__{var_key}"],
|
1784
|
-
field=var_field,
|
1785
|
-
organism=organism,
|
1786
|
-
raise_validation_error=False,
|
1787
|
-
)
|
1788
|
-
artifact._staged_feature_sets = feature_sets
|
1789
|
-
|
1790
|
-
feature_ref_is_name = _ref_is_name(self._columns_field)
|
1791
|
-
features = Feature.lookup().dict()
|
1792
|
-
for key, field in self._obs_fields.items():
|
1793
|
-
feature = features.get(key)
|
1794
|
-
registry = field.field.model
|
1795
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1796
|
-
"organism"
|
1797
|
-
)
|
1798
|
-
labels = registry.from_values(
|
1799
|
-
values=self._validated_values[key], field=field, organism=organism
|
1800
|
-
)
|
1801
|
-
if len(labels) == 0:
|
1802
|
-
continue
|
1803
|
-
if hasattr(registry, "_name_field"):
|
1804
|
-
label_ref_is_name = field.field.name == registry._name_field
|
1805
|
-
add_labels(
|
1806
|
-
artifact,
|
1807
|
-
records=labels,
|
1808
|
-
feature=feature,
|
1809
|
-
feature_ref_is_name=feature_ref_is_name,
|
1810
|
-
label_ref_is_name=label_ref_is_name,
|
1811
|
-
from_curator=True,
|
1812
|
-
)
|
1813
|
-
|
1814
|
-
return artifact.save()
|
1815
|
-
|
1816
|
-
|
1817
|
-
class SpatialDataCatManager(CatManager):
|
1818
|
-
"""Curation flow for a ``Spatialdata`` object.
|
1819
|
-
|
1820
|
-
See also :class:`~lamindb.Curator`.
|
1821
|
-
|
1822
|
-
Note that if genes or other measurements are removed from the SpatialData object,
|
1823
|
-
the object should be recreated.
|
1894
|
+
self._is_validated: bool | None = False
|
1895
|
+
self._non_validated_values: dict[str, list] | None = None
|
1896
|
+
self._validated_values: dict[str, list] = {}
|
1897
|
+
# filled by _check_save_keys
|
1898
|
+
self._n_obs: int | None = None
|
1899
|
+
self._valid_obs_keys: list[str] | None = None
|
1900
|
+
self._obs_pa_schema: pa.lib.Schema | None = (
|
1901
|
+
None # this is needed to create the obs feature set
|
1902
|
+
)
|
1903
|
+
self._valid_var_keys: list[str] | None = None
|
1904
|
+
self._var_fields_flat: dict[str, FieldAttr] | None = None
|
1905
|
+
self._check_save_keys()
|
1824
1906
|
|
1825
|
-
|
1907
|
+
# check that the provided keys in var_index and categoricals are available in the store
|
1908
|
+
# and save features
|
1909
|
+
def _check_save_keys(self):
|
1910
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1826
1911
|
|
1827
|
-
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1912
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1913
|
+
experiment_obs = experiment.obs
|
1914
|
+
self._n_obs = len(experiment_obs)
|
1915
|
+
self._obs_pa_schema = experiment_obs.schema
|
1916
|
+
valid_obs_keys = [
|
1917
|
+
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1918
|
+
]
|
1919
|
+
self._valid_obs_keys = valid_obs_keys
|
1831
1920
|
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1835
|
-
|
1836
|
-
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1840
|
-
|
1841
|
-
>>> import bionty as bt
|
1842
|
-
>>> curator = SpatialDataCatManager(
|
1843
|
-
... sdata,
|
1844
|
-
... var_index={
|
1845
|
-
... "table_1": bt.Gene.ensembl_gene_id,
|
1846
|
-
... },
|
1847
|
-
... categoricals={
|
1848
|
-
... "table1":
|
1849
|
-
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
|
1850
|
-
... "sample":
|
1851
|
-
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1852
|
-
... },
|
1853
|
-
... organism="human",
|
1854
|
-
... )
|
1855
|
-
"""
|
1921
|
+
valid_var_keys = []
|
1922
|
+
ms_list = []
|
1923
|
+
for ms in experiment.ms.keys():
|
1924
|
+
ms_list.append(ms)
|
1925
|
+
var_ms = experiment.ms[ms].var
|
1926
|
+
valid_var_keys += [
|
1927
|
+
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1928
|
+
]
|
1929
|
+
self._valid_var_keys = valid_var_keys
|
1856
1930
|
|
1857
|
-
|
1858
|
-
|
1859
|
-
|
1860
|
-
|
1861
|
-
|
1862
|
-
|
1863
|
-
organism: str | None = None,
|
1864
|
-
sources: dict[str, dict[str, Record]] | None = None,
|
1865
|
-
exclude: dict[str, dict] | None = None,
|
1866
|
-
*,
|
1867
|
-
sample_metadata_key: str | None = "sample",
|
1868
|
-
) -> None:
|
1869
|
-
super().__init__(
|
1870
|
-
dataset=sdata,
|
1871
|
-
categoricals={},
|
1872
|
-
sources=sources,
|
1873
|
-
organism=organism,
|
1874
|
-
exclude=exclude,
|
1875
|
-
)
|
1876
|
-
if isinstance(sdata, Artifact):
|
1877
|
-
# TODO: load() doesn't yet work
|
1878
|
-
self._sdata = sdata.load()
|
1879
|
-
else:
|
1880
|
-
self._sdata = self._dataset
|
1881
|
-
self._sample_metadata_key = sample_metadata_key
|
1882
|
-
self._write_path = None
|
1883
|
-
self._var_fields = var_index
|
1884
|
-
self._verify_accessor_exists(self._var_fields.keys())
|
1885
|
-
self._categoricals = categoricals
|
1886
|
-
self._table_keys = set(self._var_fields.keys()) | set(
|
1887
|
-
self._categoricals.keys() - {self._sample_metadata_key}
|
1888
|
-
)
|
1889
|
-
self._verbosity = verbosity
|
1890
|
-
self._sample_df_curator = None
|
1891
|
-
if self._sample_metadata_key is not None:
|
1892
|
-
self._sample_metadata = self._sdata.get_attrs(
|
1893
|
-
key=self._sample_metadata_key, return_as="df", flatten=True
|
1894
|
-
)
|
1895
|
-
self._is_validated = False
|
1931
|
+
# check validity of keys in categoricals
|
1932
|
+
nonval_keys = []
|
1933
|
+
for obs_key in self._obs_fields.keys():
|
1934
|
+
if obs_key not in valid_obs_keys:
|
1935
|
+
nonval_keys.append(obs_key)
|
1936
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1896
1937
|
|
1897
|
-
#
|
1938
|
+
# check validity of keys in var_index
|
1939
|
+
self._var_fields_flat = {}
|
1898
1940
|
nonval_keys = []
|
1899
|
-
for
|
1900
|
-
|
1901
|
-
|
1902
|
-
|
1903
|
-
|
1904
|
-
for key in accessor_categoricals.keys():
|
1905
|
-
if key not in self._sample_metadata.columns:
|
1906
|
-
nonval_keys.append(key)
|
1941
|
+
for ms_key in self._var_fields.keys():
|
1942
|
+
var_key, var_field = self._var_fields[ms_key]
|
1943
|
+
var_key_flat = f"{ms_key}__{var_key}"
|
1944
|
+
if var_key_flat not in valid_var_keys:
|
1945
|
+
nonval_keys.append(f"({ms_key}, {var_key})")
|
1907
1946
|
else:
|
1908
|
-
|
1909
|
-
|
1910
|
-
nonval_keys.append(key)
|
1911
|
-
|
1912
|
-
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1947
|
+
self._var_fields_flat[var_key_flat] = var_field
|
1948
|
+
_maybe_curation_keys_not_present(nonval_keys, "var_index")
|
1913
1949
|
|
1914
|
-
# check validity of keys in sources
|
1915
|
-
|
1916
|
-
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
|
1921
|
-
):
|
1922
|
-
columns = self._sample_metadata.columns
|
1923
|
-
elif accessor != self._sample_metadata_key:
|
1924
|
-
columns = self._sdata[accessor].obs.columns
|
1925
|
-
else:
|
1926
|
-
continue
|
1927
|
-
for key in accessor_sources:
|
1928
|
-
if key not in columns:
|
1929
|
-
nonval_keys.append(key)
|
1930
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1950
|
+
# check validity of keys in sources
|
1951
|
+
valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
|
1952
|
+
nonval_keys = []
|
1953
|
+
for arg_key in self._sources.keys():
|
1954
|
+
if arg_key not in valid_arg_keys:
|
1955
|
+
nonval_keys.append(arg_key)
|
1956
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1931
1957
|
|
1932
|
-
#
|
1958
|
+
# register obs columns' names
|
1959
|
+
register_columns = list(self._obs_fields.keys())
|
1960
|
+
organism = configure_organism(
|
1961
|
+
self._columns_field.field.model, self._organism
|
1962
|
+
).get("organism")
|
1963
|
+
update_registry(
|
1964
|
+
values=register_columns,
|
1965
|
+
field=self._columns_field,
|
1966
|
+
key="columns",
|
1967
|
+
validated_only=False,
|
1968
|
+
organism=organism,
|
1969
|
+
source=self._sources.get("columns"),
|
1970
|
+
)
|
1971
|
+
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1972
|
+
# no need to register with validated_only=True if columns are features
|
1933
1973
|
if (
|
1934
|
-
|
1935
|
-
and self.
|
1974
|
+
len(additional_columns) > 0
|
1975
|
+
and self._columns_field.field.model is not Feature
|
1936
1976
|
):
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
|
1941
|
-
|
1942
|
-
sources=self._sources.get(self._sample_metadata_key),
|
1943
|
-
exclude=self._exclude.get(self._sample_metadata_key),
|
1944
|
-
organism=organism,
|
1945
|
-
)
|
1946
|
-
self._table_adata_curators = {
|
1947
|
-
table: AnnDataCatManager(
|
1948
|
-
data=self._sdata[table],
|
1949
|
-
var_index=var_index.get(table),
|
1950
|
-
categoricals=self._categoricals.get(table),
|
1951
|
-
verbosity=verbosity,
|
1952
|
-
sources=self._sources.get(table),
|
1953
|
-
exclude=self._exclude.get(table),
|
1977
|
+
update_registry(
|
1978
|
+
values=additional_columns,
|
1979
|
+
field=self._columns_field,
|
1980
|
+
key="columns",
|
1981
|
+
validated_only=True,
|
1954
1982
|
organism=organism,
|
1983
|
+
source=self._sources.get("columns"),
|
1955
1984
|
)
|
1956
|
-
for table in self._table_keys
|
1957
|
-
}
|
1958
1985
|
|
1959
|
-
|
1986
|
+
def validate(self):
|
1987
|
+
"""Validate categories."""
|
1988
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1960
1989
|
|
1961
|
-
|
1962
|
-
|
1963
|
-
""
|
1964
|
-
|
1990
|
+
validated = True
|
1991
|
+
self._non_validated_values = {}
|
1992
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1993
|
+
for ms, (key, field) in self._var_fields.items():
|
1994
|
+
var_ms = experiment.ms[ms].var
|
1995
|
+
var_ms_key = f"{ms}__{key}"
|
1996
|
+
# it was already validated and cached
|
1997
|
+
if var_ms_key in self._validated_values:
|
1998
|
+
continue
|
1999
|
+
var_ms_values = (
|
2000
|
+
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
2001
|
+
)
|
2002
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2003
|
+
"organism"
|
2004
|
+
)
|
2005
|
+
update_registry(
|
2006
|
+
values=var_ms_values,
|
2007
|
+
field=field,
|
2008
|
+
key=var_ms_key,
|
2009
|
+
validated_only=True,
|
2010
|
+
organism=organism,
|
2011
|
+
source=self._sources.get(var_ms_key),
|
2012
|
+
)
|
2013
|
+
_, non_val = validate_categories(
|
2014
|
+
values=var_ms_values,
|
2015
|
+
field=field,
|
2016
|
+
key=var_ms_key,
|
2017
|
+
organism=organism,
|
2018
|
+
source=self._sources.get(var_ms_key),
|
2019
|
+
)
|
2020
|
+
if len(non_val) > 0:
|
2021
|
+
validated = False
|
2022
|
+
self._non_validated_values[var_ms_key] = non_val
|
2023
|
+
else:
|
2024
|
+
self._validated_values[var_ms_key] = var_ms_values
|
1965
2025
|
|
1966
|
-
|
1967
|
-
|
1968
|
-
|
1969
|
-
|
2026
|
+
obs = experiment.obs
|
2027
|
+
for key, field in self._obs_fields.items():
|
2028
|
+
# already validated and cached
|
2029
|
+
if key in self._validated_values:
|
2030
|
+
continue
|
2031
|
+
values = pa.compute.unique(
|
2032
|
+
obs.read(column_names=[key]).concat()[key]
|
2033
|
+
).to_pylist()
|
2034
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2035
|
+
"organism"
|
2036
|
+
)
|
2037
|
+
update_registry(
|
2038
|
+
values=values,
|
2039
|
+
field=field,
|
2040
|
+
key=key,
|
2041
|
+
validated_only=True,
|
2042
|
+
organism=organism,
|
2043
|
+
source=self._sources.get(key),
|
2044
|
+
)
|
2045
|
+
_, non_val = validate_categories(
|
2046
|
+
values=values,
|
2047
|
+
field=field,
|
2048
|
+
key=key,
|
2049
|
+
organism=organism,
|
2050
|
+
source=self._sources.get(key),
|
2051
|
+
)
|
2052
|
+
if len(non_val) > 0:
|
2053
|
+
validated = False
|
2054
|
+
self._non_validated_values[key] = non_val
|
2055
|
+
else:
|
2056
|
+
self._validated_values[key] = values
|
2057
|
+
self._is_validated = validated
|
2058
|
+
return self._is_validated
|
1970
2059
|
|
1971
|
-
|
1972
|
-
|
1973
|
-
"""Return the non-validated features and labels."""
|
1974
|
-
if self._non_validated is None:
|
1975
|
-
raise ValidationError("Please run validate() first!")
|
1976
|
-
return self._non_validated
|
2060
|
+
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
2061
|
+
assert self._non_validated_values is not None # noqa: S101
|
1977
2062
|
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
is_present = True
|
1988
|
-
if not is_present:
|
1989
|
-
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
2063
|
+
if key in self._valid_obs_keys:
|
2064
|
+
field = self._obs_fields[key]
|
2065
|
+
elif key in self._valid_var_keys:
|
2066
|
+
ms = key.partition("__")[0]
|
2067
|
+
field = self._var_fields[ms][1]
|
2068
|
+
else:
|
2069
|
+
raise KeyError(f"key {key} is invalid!")
|
2070
|
+
values = self._non_validated_values.get(key, [])
|
2071
|
+
return values, field
|
1990
2072
|
|
1991
|
-
def
|
1992
|
-
"""
|
2073
|
+
def add_new_from(self, key: str, **kwargs) -> None:
|
2074
|
+
"""Add validated & new categories.
|
1993
2075
|
|
1994
2076
|
Args:
|
1995
|
-
|
2077
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2078
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2079
|
+
or a column name in `.obs`.
|
1996
2080
|
"""
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2004
|
-
def _update_registry_all(self) -> None:
|
2005
|
-
"""Saves labels of all features for sample and table metadata."""
|
2006
|
-
if self._sample_df_curator is not None:
|
2007
|
-
self._sample_df_curator._update_registry_all(
|
2008
|
-
validated_only=True,
|
2081
|
+
if self._non_validated_values is None:
|
2082
|
+
raise ValidationError("Run .validate() first.")
|
2083
|
+
if key == "all":
|
2084
|
+
keys = list(self._non_validated_values.keys())
|
2085
|
+
else:
|
2086
|
+
avail_keys = list(
|
2087
|
+
chain(self._non_validated_values.keys(), self._validated_values.keys())
|
2009
2088
|
)
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2089
|
+
if key not in avail_keys:
|
2090
|
+
raise KeyError(
|
2091
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2092
|
+
)
|
2093
|
+
keys = [key]
|
2094
|
+
for k in keys:
|
2095
|
+
values, field = self._non_validated_values_field(k)
|
2096
|
+
if len(values) == 0:
|
2097
|
+
continue
|
2098
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2099
|
+
"organism"
|
2100
|
+
)
|
2101
|
+
update_registry(
|
2102
|
+
values=values,
|
2103
|
+
field=field,
|
2104
|
+
key=k,
|
2105
|
+
validated_only=False,
|
2106
|
+
organism=organism,
|
2107
|
+
source=self._sources.get(k),
|
2108
|
+
**kwargs,
|
2013
2109
|
)
|
2110
|
+
# update non-validated values list but keep the key there
|
2111
|
+
# it will be removed by .validate()
|
2112
|
+
if k in self._non_validated_values:
|
2113
|
+
self._non_validated_values[k] = []
|
2014
2114
|
|
2015
|
-
|
2016
|
-
|
2115
|
+
@property
|
2116
|
+
def non_validated(self) -> dict[str, list]:
|
2117
|
+
"""Return the non-validated features and labels."""
|
2118
|
+
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
2119
|
+
return non_val
|
2017
2120
|
|
2018
|
-
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2022
|
-
"""
|
2023
|
-
if self._non_validated is None:
|
2024
|
-
raise ValidationError("Run .validate() first.")
|
2025
|
-
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
2026
|
-
if table in self.non_validated.keys():
|
2027
|
-
if "var_index" in self._non_validated[table]:
|
2028
|
-
self._non_validated[table].pop("var_index")
|
2121
|
+
@property
|
2122
|
+
def var_index(self) -> dict[str, FieldAttr]:
|
2123
|
+
"""Return the registry fields with flattened keys to validate variables indices against."""
|
2124
|
+
return self._var_fields_flat
|
2029
2125
|
|
2030
|
-
|
2031
|
-
|
2126
|
+
@property
|
2127
|
+
def categoricals(self) -> dict[str, FieldAttr]:
|
2128
|
+
"""Return the obs fields to validate against."""
|
2129
|
+
return self._obs_fields
|
2032
2130
|
|
2033
|
-
def
|
2034
|
-
|
2035
|
-
key: str,
|
2036
|
-
accessor: str | None = None,
|
2037
|
-
**kwargs,
|
2038
|
-
) -> None:
|
2039
|
-
"""Save new values of categorical from sample level metadata or table.
|
2131
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
2132
|
+
"""Lookup categories.
|
2040
2133
|
|
2041
2134
|
Args:
|
2042
|
-
|
2043
|
-
accessor: The accessor key such as 'sample' or 'table x'.
|
2044
|
-
organism: The organism name.
|
2045
|
-
**kwargs: Additional keyword arguments to pass to create new records.
|
2135
|
+
public: If `True`, the lookup is performed on the public reference.
|
2046
2136
|
"""
|
2047
|
-
|
2048
|
-
|
2049
|
-
|
2050
|
-
|
2051
|
-
|
2052
|
-
|
2053
|
-
if accessor not in self.categoricals:
|
2054
|
-
raise ValueError(
|
2055
|
-
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
2056
|
-
)
|
2057
|
-
|
2058
|
-
if accessor in self._table_adata_curators:
|
2059
|
-
adata_curator = self._table_adata_curators[accessor]
|
2060
|
-
adata_curator.add_new_from(key=key, **kwargs)
|
2061
|
-
if accessor == self._sample_metadata_key:
|
2062
|
-
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
2063
|
-
|
2064
|
-
if accessor in self.non_validated.keys():
|
2065
|
-
if len(self.non_validated[accessor].values()) == 0:
|
2066
|
-
self.non_validated.pop(accessor)
|
2137
|
+
return CatLookup(
|
2138
|
+
categoricals=self._obs_fields,
|
2139
|
+
slots={"columns": self._columns_field, **self._var_fields_flat},
|
2140
|
+
public=public,
|
2141
|
+
)
|
2067
2142
|
|
2068
|
-
def standardize(self, key: str
|
2069
|
-
"""Replace synonyms with
|
2143
|
+
def standardize(self, key: str):
|
2144
|
+
"""Replace synonyms with standardized values.
|
2070
2145
|
|
2071
2146
|
Modifies the dataset inplace.
|
2072
2147
|
|
2073
2148
|
Args:
|
2074
|
-
key: The key referencing the slot in the
|
2075
|
-
|
2149
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2150
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2151
|
+
or a column name in `.obs`.
|
2076
2152
|
"""
|
2077
2153
|
if len(self.non_validated) == 0:
|
2078
2154
|
logger.warning("values are already standardized")
|
2079
2155
|
return
|
2080
|
-
|
2081
|
-
|
2082
|
-
|
2083
|
-
if accessor == self._sample_metadata_key:
|
2084
|
-
if key not in self._sample_metadata.columns:
|
2085
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2156
|
+
avail_keys = list(self._non_validated_values.keys())
|
2157
|
+
if key == "all":
|
2158
|
+
keys = avail_keys
|
2086
2159
|
else:
|
2087
|
-
if
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
|
2092
|
-
):
|
2093
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2094
|
-
|
2095
|
-
if accessor in self._table_adata_curators.keys():
|
2096
|
-
adata_curator = self._table_adata_curators[accessor]
|
2097
|
-
adata_curator.standardize(key)
|
2098
|
-
if accessor == self._sample_metadata_key:
|
2099
|
-
self._sample_df_curator.standardize(key)
|
2100
|
-
|
2101
|
-
if len(self.non_validated[accessor].values()) == 0:
|
2102
|
-
self.non_validated.pop(accessor)
|
2103
|
-
|
2104
|
-
def validate(self) -> bool:
|
2105
|
-
"""Validate variables and categorical observations.
|
2106
|
-
|
2107
|
-
This method also registers the validated records in the current instance:
|
2108
|
-
- from public sources
|
2109
|
-
|
2110
|
-
Args:
|
2111
|
-
organism: The organism name.
|
2160
|
+
if key not in avail_keys:
|
2161
|
+
raise KeyError(
|
2162
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2163
|
+
)
|
2164
|
+
keys = [key]
|
2112
2165
|
|
2113
|
-
|
2114
|
-
|
2115
|
-
|
2116
|
-
|
2166
|
+
for k in keys:
|
2167
|
+
values, field = self._non_validated_values_field(k)
|
2168
|
+
if len(values) == 0:
|
2169
|
+
continue
|
2170
|
+
if k in self._valid_var_keys:
|
2171
|
+
ms, _, slot_key = k.partition("__")
|
2172
|
+
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
2173
|
+
else:
|
2174
|
+
slot = lambda experiment: experiment.obs
|
2175
|
+
slot_key = k
|
2176
|
+
# errors if public ontology and the model has no organism
|
2177
|
+
# has to be fixed in bionty
|
2178
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2179
|
+
"organism"
|
2180
|
+
)
|
2181
|
+
syn_mapper = standardize_categories(
|
2182
|
+
values=values,
|
2183
|
+
field=field,
|
2184
|
+
source=self._sources.get(k),
|
2185
|
+
organism=organism,
|
2186
|
+
)
|
2187
|
+
if (n_syn_mapper := len(syn_mapper)) == 0:
|
2188
|
+
continue
|
2117
2189
|
|
2118
|
-
|
2119
|
-
verbosity = settings.verbosity
|
2120
|
-
try:
|
2121
|
-
settings.verbosity = "error"
|
2122
|
-
self._update_registry_all()
|
2123
|
-
finally:
|
2124
|
-
settings.verbosity = verbosity
|
2190
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
2125
2191
|
|
2126
|
-
|
2192
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
2193
|
+
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
2194
|
+
table = slot(experiment).read(value_filter=value_filter).concat()
|
2127
2195
|
|
2128
|
-
|
2129
|
-
|
2130
|
-
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
2131
|
-
sample_validated &= self._sample_df_curator.validate()
|
2132
|
-
if len(self._sample_df_curator.non_validated) > 0:
|
2133
|
-
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
2134
|
-
logger.print("")
|
2196
|
+
if len(table) == 0:
|
2197
|
+
continue
|
2135
2198
|
|
2136
|
-
|
2137
|
-
|
2138
|
-
|
2139
|
-
|
2140
|
-
|
2141
|
-
|
2142
|
-
|
2199
|
+
df = table.to_pandas()
|
2200
|
+
# map values
|
2201
|
+
df[slot_key] = df[slot_key].map(
|
2202
|
+
lambda val: syn_mapper.get(val, val) # noqa
|
2203
|
+
)
|
2204
|
+
# write the mapped values
|
2205
|
+
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
2206
|
+
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
2207
|
+
# update non_validated dict
|
2208
|
+
non_val_k = [
|
2209
|
+
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
2210
|
+
]
|
2211
|
+
self._non_validated_values[k] = non_val_k
|
2143
2212
|
|
2144
|
-
|
2145
|
-
|
2213
|
+
syn_mapper_print = _format_values(
|
2214
|
+
[f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
|
2215
|
+
)
|
2216
|
+
s = "s" if n_syn_mapper > 1 else ""
|
2217
|
+
logger.success(
|
2218
|
+
f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
|
2219
|
+
)
|
2146
2220
|
|
2147
2221
|
def save_artifact(
|
2148
2222
|
self,
|
@@ -2152,217 +2226,119 @@ class SpatialDataCatManager(CatManager):
|
|
2152
2226
|
revises: Artifact | None = None,
|
2153
2227
|
run: Run | None = None,
|
2154
2228
|
) -> Artifact:
|
2229
|
+
"""Save the validated `tiledbsoma` store and metadata.
|
2230
|
+
|
2231
|
+
Args:
|
2232
|
+
description: A description of the ``tiledbsoma`` store.
|
2233
|
+
key: A path-like key to reference artifact in default storage,
|
2234
|
+
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
|
2235
|
+
revises: Previous version of the artifact. Triggers a revision.
|
2236
|
+
run: The run that creates the artifact.
|
2237
|
+
|
2238
|
+
Returns:
|
2239
|
+
A saved artifact record.
|
2240
|
+
"""
|
2155
2241
|
if not self._is_validated:
|
2156
2242
|
self.validate()
|
2157
2243
|
if not self._is_validated:
|
2158
2244
|
raise ValidationError("Dataset does not validate. Please curate.")
|
2159
2245
|
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
self._artifact = Artifact.from_spatialdata(
|
2165
|
-
self._sdata,
|
2166
|
-
key=key,
|
2246
|
+
if self._artifact is None:
|
2247
|
+
artifact = Artifact(
|
2248
|
+
self._dataset,
|
2167
2249
|
description=description,
|
2250
|
+
key=key,
|
2168
2251
|
revises=revises,
|
2169
2252
|
run=run,
|
2170
2253
|
)
|
2171
|
-
self.
|
2254
|
+
artifact.n_observations = self._n_obs
|
2255
|
+
artifact.otype = "tiledbsoma"
|
2256
|
+
artifact.save()
|
2257
|
+
else:
|
2258
|
+
artifact = self._artifact
|
2172
2259
|
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
self._organism
|
2260
|
+
feature_sets = {}
|
2261
|
+
if len(self._obs_fields) > 0:
|
2262
|
+
organism = configure_organism(
|
2263
|
+
self._columns_field.field.model, self._organism
|
2264
|
+
).get("organism")
|
2265
|
+
empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
|
2266
|
+
mock_df = pa.Table.from_pydict(
|
2267
|
+
empty_dict, schema=self._obs_pa_schema
|
2268
|
+
).to_pandas()
|
2269
|
+
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
2270
|
+
feature_sets["obs"] = Schema.from_df(
|
2271
|
+
df=mock_df,
|
2272
|
+
field=self._columns_field,
|
2273
|
+
mute=True,
|
2274
|
+
organism=organism,
|
2177
2275
|
)
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2182
|
-
obs_fields: dict[str, FieldAttr] = None,
|
2183
|
-
mute: bool = False,
|
2184
|
-
organism: str | Record | None = None,
|
2185
|
-
):
|
2186
|
-
"""Add Schemas from SpatialData."""
|
2187
|
-
if obs_fields is None:
|
2188
|
-
obs_fields = {}
|
2189
|
-
assert host.otype == "SpatialData" # noqa: S101
|
2190
|
-
|
2191
|
-
feature_sets = {}
|
2192
|
-
|
2193
|
-
# sample features
|
2194
|
-
sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
|
2195
|
-
if len(sample_features) > 0:
|
2196
|
-
feature_sets[self._sample_metadata_key] = Schema(
|
2197
|
-
features=sample_features
|
2198
|
-
)
|
2199
|
-
|
2200
|
-
# table features
|
2201
|
-
for table, field in var_fields.items():
|
2202
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
2203
|
-
self._sdata[table],
|
2204
|
-
var_field=field,
|
2205
|
-
obs_field=obs_fields.get(table, Feature.name),
|
2206
|
-
mute=mute,
|
2207
|
-
organism=organism,
|
2208
|
-
)
|
2209
|
-
for k, v in table_fs.items():
|
2210
|
-
feature_sets[f"['{table}'].{k}"] = v
|
2211
|
-
|
2212
|
-
def _unify_staged_feature_sets_by_hash(
|
2213
|
-
feature_sets: MutableMapping[str, Schema],
|
2214
|
-
):
|
2215
|
-
unique_values: dict[str, Any] = {}
|
2216
|
-
|
2217
|
-
for key, value in feature_sets.items():
|
2218
|
-
value_hash = (
|
2219
|
-
value.hash
|
2220
|
-
) # Assuming each value has a .hash attribute
|
2221
|
-
if value_hash in unique_values:
|
2222
|
-
feature_sets[key] = unique_values[value_hash]
|
2223
|
-
else:
|
2224
|
-
unique_values[value_hash] = value
|
2225
|
-
|
2226
|
-
return feature_sets
|
2227
|
-
|
2228
|
-
# link feature sets
|
2229
|
-
host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
|
2230
|
-
feature_sets
|
2231
|
-
)
|
2232
|
-
host.save()
|
2233
|
-
|
2234
|
-
_add_set_from_spatialdata(
|
2235
|
-
self._artifact, var_fields=self._var_fields, **feature_kwargs
|
2276
|
+
for ms in self._var_fields:
|
2277
|
+
var_key, var_field = self._var_fields[ms]
|
2278
|
+
organism = configure_organism(var_field.field.model, self._organism).get(
|
2279
|
+
"organism"
|
2236
2280
|
)
|
2281
|
+
feature_sets[f"{ms}__var"] = Schema.from_values(
|
2282
|
+
values=self._validated_values[f"{ms}__{var_key}"],
|
2283
|
+
field=var_field,
|
2284
|
+
organism=organism,
|
2285
|
+
raise_validation_error=False,
|
2286
|
+
)
|
2287
|
+
artifact._staged_feature_sets = feature_sets
|
2237
2288
|
|
2238
|
-
|
2239
|
-
|
2240
|
-
|
2241
|
-
|
2242
|
-
|
2243
|
-
|
2244
|
-
|
2245
|
-
|
2246
|
-
|
2247
|
-
|
2248
|
-
feature = features.get(key)
|
2249
|
-
registry = field.field.model
|
2250
|
-
filter_kwargs = check_registry_organism(registry, self._organism)
|
2251
|
-
filter_kwargs_current = get_current_filter_kwargs(
|
2252
|
-
registry, filter_kwargs
|
2253
|
-
)
|
2254
|
-
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2255
|
-
labels = registry.from_values(
|
2256
|
-
df[key],
|
2257
|
-
field=field,
|
2258
|
-
**filter_kwargs_current,
|
2259
|
-
)
|
2260
|
-
if len(labels) == 0:
|
2261
|
-
continue
|
2262
|
-
|
2263
|
-
label_ref_is_name = None
|
2264
|
-
if hasattr(registry, "_name_field"):
|
2265
|
-
label_ref_is_name = field.field.name == registry._name_field
|
2266
|
-
add_labels(
|
2267
|
-
artifact,
|
2268
|
-
records=labels,
|
2269
|
-
feature=feature,
|
2270
|
-
feature_ref_is_name=feature_ref_is_name,
|
2271
|
-
label_ref_is_name=label_ref_is_name,
|
2272
|
-
from_curator=True,
|
2273
|
-
)
|
2274
|
-
|
2275
|
-
for accessor, accessor_fields in self._categoricals.items():
|
2276
|
-
column_field = self._var_fields.get(accessor)
|
2277
|
-
if accessor == self._sample_metadata_key:
|
2278
|
-
_add_labels_from_spatialdata(
|
2279
|
-
self._sample_metadata,
|
2280
|
-
self._artifact,
|
2281
|
-
accessor_fields,
|
2282
|
-
feature_ref_is_name=(
|
2283
|
-
None if column_field is None else _ref_is_name(column_field)
|
2284
|
-
),
|
2285
|
-
)
|
2286
|
-
else:
|
2287
|
-
_add_labels_from_spatialdata(
|
2288
|
-
self._sdata.tables[accessor],
|
2289
|
-
self._artifact,
|
2290
|
-
accessor_fields,
|
2291
|
-
feature_ref_is_name=(
|
2292
|
-
None if column_field is None else _ref_is_name(column_field)
|
2293
|
-
),
|
2294
|
-
)
|
2295
|
-
|
2296
|
-
finally:
|
2297
|
-
settings.verbosity = verbosity
|
2298
|
-
|
2299
|
-
slug = ln_setup.settings.instance.slug
|
2300
|
-
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
2301
|
-
logger.important(
|
2302
|
-
f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
|
2289
|
+
feature_ref_is_name = _ref_is_name(self._columns_field)
|
2290
|
+
features = Feature.lookup().dict()
|
2291
|
+
for key, field in self._obs_fields.items():
|
2292
|
+
feature = features.get(key)
|
2293
|
+
registry = field.field.model
|
2294
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2295
|
+
"organism"
|
2296
|
+
)
|
2297
|
+
labels = registry.from_values(
|
2298
|
+
values=self._validated_values[key], field=field, organism=organism
|
2303
2299
|
)
|
2300
|
+
if len(labels) == 0:
|
2301
|
+
continue
|
2302
|
+
if hasattr(registry, "_name_field"):
|
2303
|
+
label_ref_is_name = field.field.name == registry._name_field
|
2304
|
+
add_labels(
|
2305
|
+
artifact,
|
2306
|
+
records=labels,
|
2307
|
+
feature=feature,
|
2308
|
+
feature_ref_is_name=feature_ref_is_name,
|
2309
|
+
label_ref_is_name=label_ref_is_name,
|
2310
|
+
from_curator=True,
|
2311
|
+
)
|
2304
2312
|
|
2305
|
-
return
|
2313
|
+
return artifact.save()
|
2306
2314
|
|
2307
2315
|
|
2308
|
-
|
2309
|
-
|
2310
|
-
) -> dict[str, str]:
|
2311
|
-
"""Restrict the obs fields to name return only available obs fields.
|
2316
|
+
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2317
|
+
"""Categorical manager for `AnnData` respecting the CELLxGENE schema.
|
2312
2318
|
|
2313
|
-
|
2314
|
-
If both are available, we validate against ontology_id.
|
2315
|
-
If none are available, we validate against name.
|
2319
|
+
This will be superseded by a schema-based curation flow.
|
2316
2320
|
"""
|
2317
|
-
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
2318
|
-
for name, field in obs_fields.items():
|
2319
|
-
if name.endswith("_ontology_term_id"):
|
2320
|
-
continue
|
2321
|
-
# if both the ontology id and the name are present, only validate on the ontology_id
|
2322
|
-
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
2323
|
-
obs_fields_unique.pop(name)
|
2324
|
-
# if the neither name nor ontology id are present, validate on the name
|
2325
|
-
# this will raise error downstream, we just use name to be more readable
|
2326
|
-
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2327
|
-
obs_fields_unique[name] = field
|
2328
|
-
|
2329
|
-
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
2330
|
-
available_obs_fields = {
|
2331
|
-
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
2332
|
-
}
|
2333
|
-
|
2334
|
-
return available_obs_fields
|
2335
|
-
|
2336
2321
|
|
2337
|
-
|
2338
|
-
|
2339
|
-
|
2340
|
-
|
2341
|
-
|
2342
|
-
|
2343
|
-
|
2344
|
-
|
2345
|
-
|
2346
|
-
|
2347
|
-
logger.important(
|
2348
|
-
f"added default value '{default}' to the adata.obs['{name}']"
|
2349
|
-
)
|
2350
|
-
|
2351
|
-
|
2352
|
-
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2353
|
-
"""Annotation flow of AnnData based on CELLxGENE schema."""
|
2354
|
-
|
2355
|
-
_controls_were_created: bool | None = None
|
2322
|
+
cxg_categoricals_defaults = {
|
2323
|
+
"cell_type": "unknown",
|
2324
|
+
"development_stage": "unknown",
|
2325
|
+
"disease": "normal",
|
2326
|
+
"donor_id": "unknown",
|
2327
|
+
"self_reported_ethnicity": "unknown",
|
2328
|
+
"sex": "unknown",
|
2329
|
+
"suspension_type": "cell",
|
2330
|
+
"tissue_type": "tissue",
|
2331
|
+
}
|
2356
2332
|
|
2357
2333
|
def __init__(
|
2358
2334
|
self,
|
2359
|
-
adata: ad.AnnData
|
2335
|
+
adata: ad.AnnData,
|
2360
2336
|
categoricals: dict[str, FieldAttr] | None = None,
|
2361
2337
|
organism: Literal["human", "mouse"] = "human",
|
2362
2338
|
*,
|
2339
|
+
schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
|
2363
2340
|
defaults: dict[str, str] = None,
|
2364
2341
|
extra_sources: dict[str, Record] = None,
|
2365
|
-
schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
|
2366
2342
|
verbosity: str = "hint",
|
2367
2343
|
) -> None:
|
2368
2344
|
"""CELLxGENE schema curator.
|
@@ -2372,304 +2348,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2372
2348
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
2373
2349
|
The CELLxGENE Curator maps against the required CELLxGENE fields by default.
|
2374
2350
|
organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
|
2351
|
+
schema_version: The CELLxGENE schema version to curate against.
|
2375
2352
|
defaults: Default values that are set if columns or column values are missing.
|
2376
2353
|
extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
|
2377
2354
|
These extra sources are joined with the CELLxGENE fixed sources.
|
2378
2355
|
Use this parameter when subclassing.
|
2379
|
-
exclude: A dictionary mapping column names to values to exclude.
|
2380
|
-
schema_version: The CELLxGENE schema version to curate against.
|
2381
2356
|
verbosity: The verbosity level.
|
2382
|
-
|
2383
2357
|
"""
|
2384
2358
|
import bionty as bt
|
2385
2359
|
|
2386
|
-
|
2360
|
+
from ._cellxgene_schemas import (
|
2361
|
+
_add_defaults_to_obs,
|
2362
|
+
_create_sources,
|
2363
|
+
_init_categoricals_additional_values,
|
2364
|
+
_restrict_obs_fields,
|
2365
|
+
)
|
2387
2366
|
|
2388
|
-
|
2367
|
+
# Add defaults first to ensure that we fetch valid sources
|
2368
|
+
if defaults:
|
2369
|
+
_add_defaults_to_obs(adata.obs, defaults)
|
2389
2370
|
|
2371
|
+
# Filter categoricals based on what's present in adata
|
2390
2372
|
if categoricals is None:
|
2391
|
-
categoricals =
|
2373
|
+
categoricals = self._get_cxg_categoricals()
|
2374
|
+
categoricals = _restrict_obs_fields(adata.obs, categoricals)
|
2392
2375
|
|
2393
|
-
|
2394
|
-
|
2395
|
-
VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
|
2396
|
-
if schema_version not in VALID_SCHEMA_VERSIONS:
|
2397
|
-
valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
|
2398
|
-
raise ValueError(
|
2399
|
-
f"Invalid schema_version: {schema_version}. "
|
2400
|
-
f"Valid versions are: {valid_versions}"
|
2401
|
-
)
|
2376
|
+
# Configure sources
|
2377
|
+
sources = _create_sources(categoricals, schema_version, organism)
|
2402
2378
|
self.schema_version = schema_version
|
2403
2379
|
self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
|
2404
|
-
with resources.path(
|
2405
|
-
"lamindb.curators._cellxgene_schemas", "schema_versions.yml"
|
2406
|
-
) as schema_versions_path:
|
2407
|
-
self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
|
2408
|
-
self.schema_version
|
2409
|
-
]
|
2410
|
-
|
2411
|
-
# Fetch AnnData obs to be able to set defaults and get sources
|
2412
|
-
if isinstance(adata, ad.AnnData):
|
2413
|
-
self._adata_obs = adata.obs
|
2414
|
-
else:
|
2415
|
-
self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
|
2416
|
-
|
2417
|
-
# Add defaults first to ensure that we fetch valid sources
|
2418
|
-
if defaults:
|
2419
|
-
_add_defaults_to_obs(self._adata_obs, defaults)
|
2420
|
-
|
2421
|
-
self.sources = self._create_sources(self._adata_obs)
|
2422
|
-
self.sources = {
|
2423
|
-
entity: source
|
2424
|
-
for entity, source in self.sources.items()
|
2425
|
-
if source is not None
|
2426
|
-
}
|
2427
|
-
|
2428
2380
|
# These sources are not a part of the cellxgene schema but rather passed through.
|
2429
2381
|
# This is useful when other Curators extend the CELLxGENE curator
|
2430
2382
|
if extra_sources:
|
2431
|
-
|
2383
|
+
sources = sources | extra_sources
|
2432
2384
|
|
2433
|
-
|
2434
|
-
exclude_keys = {
|
2435
|
-
entity: default
|
2436
|
-
for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
|
2437
|
-
if entity in self._adata_obs.columns # type: ignore
|
2438
|
-
}
|
2385
|
+
_init_categoricals_additional_values()
|
2439
2386
|
|
2440
2387
|
super().__init__(
|
2441
2388
|
data=adata,
|
2442
|
-
var_index=
|
2443
|
-
categoricals=
|
2389
|
+
var_index=bt.Gene.ensembl_gene_id,
|
2390
|
+
categoricals=categoricals,
|
2444
2391
|
verbosity=verbosity,
|
2445
2392
|
organism=organism,
|
2446
|
-
sources=
|
2447
|
-
exclude=exclude_keys,
|
2393
|
+
sources=sources,
|
2448
2394
|
)
|
2449
2395
|
|
2450
2396
|
@classmethod
|
2451
|
-
|
2452
|
-
import bionty as bt
|
2453
|
-
|
2454
|
-
import lamindb as ln
|
2455
|
-
|
2456
|
-
# Note: if you add another control below, be mindful to change the if condition that
|
2457
|
-
# triggers whether creating these records is re-considered
|
2458
|
-
if cls._controls_were_created is None:
|
2459
|
-
cls._controls_were_created = (
|
2460
|
-
ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
|
2461
|
-
is not None
|
2462
|
-
)
|
2463
|
-
if not cls._controls_were_created:
|
2464
|
-
logger.important("Creating control labels in the CellxGene schema.")
|
2465
|
-
bt.CellType(
|
2466
|
-
ontology_id="unknown",
|
2467
|
-
name="unknown",
|
2468
|
-
description="From CellxGene schema.",
|
2469
|
-
).save()
|
2470
|
-
pato = bt.Source.filter(name="pato", version="2024-03-28").one()
|
2471
|
-
normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
|
2472
|
-
bt.Disease(
|
2473
|
-
uid=normal.uid,
|
2474
|
-
name=normal.name,
|
2475
|
-
ontology_id=normal.ontology_id,
|
2476
|
-
description=normal.description,
|
2477
|
-
source=normal.source,
|
2478
|
-
).save()
|
2479
|
-
bt.Ethnicity(
|
2480
|
-
ontology_id="na", name="na", description="From CellxGene schema."
|
2481
|
-
).save()
|
2482
|
-
bt.Ethnicity(
|
2483
|
-
ontology_id="unknown",
|
2484
|
-
name="unknown",
|
2485
|
-
description="From CellxGene schema.",
|
2486
|
-
).save()
|
2487
|
-
bt.DevelopmentalStage(
|
2488
|
-
ontology_id="unknown",
|
2489
|
-
name="unknown",
|
2490
|
-
description="From CellxGene schema.",
|
2491
|
-
).save()
|
2492
|
-
bt.Phenotype(
|
2493
|
-
ontology_id="unknown",
|
2494
|
-
name="unknown",
|
2495
|
-
description="From CellxGene schema.",
|
2496
|
-
).save()
|
2497
|
-
|
2498
|
-
tissue_type = ln.ULabel(
|
2499
|
-
name="TissueType",
|
2500
|
-
is_type=True,
|
2501
|
-
description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
|
2502
|
-
).save()
|
2503
|
-
ln.ULabel(
|
2504
|
-
name="tissue", type=tissue_type, description="From CellxGene schema."
|
2505
|
-
).save()
|
2506
|
-
ln.ULabel(
|
2507
|
-
name="organoid", type=tissue_type, description="From CellxGene schema."
|
2508
|
-
).save()
|
2509
|
-
ln.ULabel(
|
2510
|
-
name="cell culture",
|
2511
|
-
type=tissue_type,
|
2512
|
-
description="From CellxGene schema.",
|
2513
|
-
).save()
|
2514
|
-
|
2515
|
-
suspension_type = ln.ULabel(
|
2516
|
-
name="SuspensionType",
|
2517
|
-
is_type=True,
|
2518
|
-
description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
|
2519
|
-
).save()
|
2520
|
-
ln.ULabel(
|
2521
|
-
name="cell", type=suspension_type, description="From CellxGene schema."
|
2522
|
-
).save()
|
2523
|
-
ln.ULabel(
|
2524
|
-
name="nucleus",
|
2525
|
-
type=suspension_type,
|
2526
|
-
description="From CellxGene schema.",
|
2527
|
-
).save()
|
2528
|
-
ln.ULabel(name="na", type=suspension_type).save()
|
2529
|
-
|
2530
|
-
@classmethod
|
2531
|
-
def _get_categoricals(cls) -> dict[str, FieldAttr]:
|
2532
|
-
import bionty as bt
|
2533
|
-
|
2534
|
-
return {
|
2535
|
-
"assay": bt.ExperimentalFactor.name,
|
2536
|
-
"assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
|
2537
|
-
"cell_type": bt.CellType.name,
|
2538
|
-
"cell_type_ontology_term_id": bt.CellType.ontology_id,
|
2539
|
-
"development_stage": bt.DevelopmentalStage.name,
|
2540
|
-
"development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
|
2541
|
-
"disease": bt.Disease.name,
|
2542
|
-
"disease_ontology_term_id": bt.Disease.ontology_id,
|
2543
|
-
# "donor_id": "str", via pandera
|
2544
|
-
"self_reported_ethnicity": bt.Ethnicity.name,
|
2545
|
-
"self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
|
2546
|
-
"sex": bt.Phenotype.name,
|
2547
|
-
"sex_ontology_term_id": bt.Phenotype.ontology_id,
|
2548
|
-
"suspension_type": ULabel.name,
|
2549
|
-
"tissue": bt.Tissue.name,
|
2550
|
-
"tissue_ontology_term_id": bt.Tissue.ontology_id,
|
2551
|
-
"tissue_type": ULabel.name,
|
2552
|
-
"organism": bt.Organism.name,
|
2553
|
-
"organism_ontology_term_id": bt.Organism.ontology_id,
|
2554
|
-
}
|
2555
|
-
|
2556
|
-
@classmethod
|
2397
|
+
@deprecated(new_name="cxg_categoricals_defaults")
|
2557
2398
|
def _get_categoricals_defaults(cls) -> dict[str, str]:
|
2558
|
-
return
|
2559
|
-
"cell_type": "unknown",
|
2560
|
-
"development_stage": "unknown",
|
2561
|
-
"disease": "normal",
|
2562
|
-
"donor_id": "unknown",
|
2563
|
-
"self_reported_ethnicity": "unknown",
|
2564
|
-
"sex": "unknown",
|
2565
|
-
"suspension_type": "cell",
|
2566
|
-
"tissue_type": "tissue",
|
2567
|
-
}
|
2568
|
-
|
2569
|
-
@property
|
2570
|
-
def pinned_ontologies(self) -> pd.DataFrame:
|
2571
|
-
return self._pinned_ontologies
|
2572
|
-
|
2573
|
-
@property
|
2574
|
-
def adata(self) -> AnnData:
|
2575
|
-
return self._adata
|
2576
|
-
|
2577
|
-
def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
|
2578
|
-
"""Creates a sources dictionary that can be passed to AnnDataCatManager."""
|
2579
|
-
import bionty as bt
|
2580
|
-
|
2581
|
-
# fmt: off
|
2582
|
-
def _fetch_bionty_source(
|
2583
|
-
entity: str, organism: str, source: str
|
2584
|
-
) -> bt.Source | None:
|
2585
|
-
"""Fetch the Bionty source of the pinned ontology.
|
2399
|
+
return cls.cxg_categoricals_defaults
|
2586
2400
|
|
2587
|
-
|
2588
|
-
|
2589
|
-
|
2590
|
-
|
2591
|
-
(self._pinned_ontologies["source"] == source), "version"].iloc[0]
|
2592
|
-
return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
|
2593
|
-
|
2594
|
-
entity_mapping = {
|
2595
|
-
"var_index": ("Gene", self.organism, "ensembl"),
|
2596
|
-
"cell_type": ("CellType", "all", "cl"),
|
2597
|
-
"assay": ("ExperimentalFactor", "all", "efo"),
|
2598
|
-
"self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
|
2599
|
-
"development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
|
2600
|
-
"disease": ("Disease", "all", "mondo"),
|
2601
|
-
# "organism": ("Organism", "vertebrates", "ensembl"),
|
2602
|
-
"sex": ("Phenotype", "all", "pato"),
|
2603
|
-
"tissue": ("Tissue", "all", "uberon"),
|
2604
|
-
}
|
2605
|
-
# fmt: on
|
2606
|
-
|
2607
|
-
# Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
|
2608
|
-
entity_to_sources = {
|
2609
|
-
entity: _fetch_bionty_source(*params)
|
2610
|
-
for entity, params in entity_mapping.items()
|
2611
|
-
if entity in obs.columns
|
2612
|
-
or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
|
2613
|
-
or entity == "var_index"
|
2614
|
-
}
|
2615
|
-
|
2616
|
-
return entity_to_sources
|
2617
|
-
|
2618
|
-
def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
|
2619
|
-
"""Converts a column that stores a name into a column that stores the ontology id.
|
2620
|
-
|
2621
|
-
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2622
|
-
"""
|
2623
|
-
field_name = field.field.name
|
2624
|
-
assert field_name == "name" # noqa: S101
|
2625
|
-
cols = ["name", "ontology_id"]
|
2626
|
-
registry = field.field.model
|
2401
|
+
@classmethod
|
2402
|
+
def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
|
2403
|
+
"""Returns the CELLxGENE schema mapped fields."""
|
2404
|
+
from ._cellxgene_schemas import _get_cxg_categoricals
|
2627
2405
|
|
2628
|
-
|
2629
|
-
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2630
|
-
mapper = (
|
2631
|
-
pd.DataFrame(validated_records.values_list(*cols))
|
2632
|
-
.set_index(0)
|
2633
|
-
.to_dict()[1]
|
2634
|
-
)
|
2635
|
-
return values.map(mapper)
|
2406
|
+
return _get_cxg_categoricals()
|
2636
2407
|
|
2637
|
-
def validate(self) -> bool:
|
2408
|
+
def validate(self) -> bool:
|
2638
2409
|
"""Validates the AnnData object against most cellxgene requirements."""
|
2410
|
+
from ._cellxgene_schemas import RESERVED_NAMES
|
2411
|
+
|
2639
2412
|
# Verify that all required obs columns are present
|
2413
|
+
required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
|
2640
2414
|
missing_obs_fields = [
|
2641
2415
|
name
|
2642
|
-
for name in
|
2416
|
+
for name in required_columns
|
2643
2417
|
if name not in self._adata.obs.columns
|
2644
2418
|
and f"{name}_ontology_term_id" not in self._adata.obs.columns
|
2645
2419
|
]
|
2646
2420
|
if len(missing_obs_fields) > 0:
|
2647
|
-
|
2648
|
-
|
2649
|
-
|
2650
|
-
"consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
|
2651
|
-
"to automatically add these columns with default values."
|
2421
|
+
logger.error(
|
2422
|
+
f"missing required obs columns {_format_values(missing_obs_fields)}\n"
|
2423
|
+
" → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
|
2652
2424
|
)
|
2653
2425
|
return False
|
2654
2426
|
|
2655
2427
|
# Verify that no cellxgene reserved names are present
|
2656
|
-
reserved_names = {
|
2657
|
-
"ethnicity",
|
2658
|
-
"ethnicity_ontology_term_id",
|
2659
|
-
"X_normalization",
|
2660
|
-
"default_field",
|
2661
|
-
"layer_descriptions",
|
2662
|
-
"tags",
|
2663
|
-
"versions",
|
2664
|
-
"contributors",
|
2665
|
-
"preprint_doi",
|
2666
|
-
"project_description",
|
2667
|
-
"project_links",
|
2668
|
-
"project_name",
|
2669
|
-
"publication_doi",
|
2670
|
-
}
|
2671
2428
|
matched_columns = [
|
2672
|
-
column for column in self._adata.obs.columns if column in
|
2429
|
+
column for column in self._adata.obs.columns if column in RESERVED_NAMES
|
2673
2430
|
]
|
2674
2431
|
if len(matched_columns) > 0:
|
2675
2432
|
raise ValueError(
|
@@ -2696,6 +2453,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2696
2453
|
Returns:
|
2697
2454
|
An AnnData object which adheres to the cellxgene-schema.
|
2698
2455
|
"""
|
2456
|
+
|
2457
|
+
def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
|
2458
|
+
"""Converts a column that stores a name into a column that stores the ontology id.
|
2459
|
+
|
2460
|
+
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2461
|
+
"""
|
2462
|
+
field_name = field.field.name
|
2463
|
+
assert field_name == "name" # noqa: S101
|
2464
|
+
cols = ["name", "ontology_id"]
|
2465
|
+
registry = field.field.model
|
2466
|
+
|
2467
|
+
if hasattr(registry, "ontology_id"):
|
2468
|
+
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2469
|
+
mapper = (
|
2470
|
+
pd.DataFrame(validated_records.values_list(*cols))
|
2471
|
+
.set_index(0)
|
2472
|
+
.to_dict()[1]
|
2473
|
+
)
|
2474
|
+
return values.map(mapper)
|
2475
|
+
|
2699
2476
|
# Create a copy since we modify the AnnData object extensively
|
2700
2477
|
adata_cxg = self._adata.copy()
|
2701
2478
|
|
@@ -2715,7 +2492,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2715
2492
|
# convert name column to ontology_term_id column
|
2716
2493
|
for column in adata_cxg.obs.columns:
|
2717
2494
|
if column in self.categoricals and not column.endswith("_ontology_term_id"):
|
2718
|
-
mapped_column =
|
2495
|
+
mapped_column = _convert_name_to_ontology_id(
|
2719
2496
|
adata_cxg.obs[column], field=self.categoricals.get(column)
|
2720
2497
|
)
|
2721
2498
|
if mapped_column is not None:
|
@@ -2881,7 +2658,7 @@ class TimeHandler:
|
|
2881
2658
|
|
2882
2659
|
|
2883
2660
|
class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
2884
|
-
"""
|
2661
|
+
"""Categorical manager for `AnnData` to manage perturbations."""
|
2885
2662
|
|
2886
2663
|
PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
|
2887
2664
|
|
@@ -2892,45 +2669,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
|
2892
2669
|
pert_dose: bool = True,
|
2893
2670
|
pert_time: bool = True,
|
2894
2671
|
*,
|
2672
|
+
cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
|
2895
2673
|
verbosity: str = "hint",
|
2896
|
-
cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
|
2897
2674
|
):
|
2898
2675
|
"""Initialize the curator with configuration and validation settings."""
|
2899
|
-
import bionty as bt
|
2900
|
-
|
2901
2676
|
self._pert_time = pert_time
|
2902
2677
|
self._pert_dose = pert_dose
|
2903
2678
|
|
2904
2679
|
self._validate_initial_data(adata)
|
2905
|
-
self.
|
2906
|
-
|
2907
|
-
self._setup_sources(adata)
|
2908
|
-
self._setup_compound_source()
|
2680
|
+
categoricals, categoricals_defaults = self._configure_categoricals(adata)
|
2909
2681
|
|
2910
2682
|
super().__init__(
|
2911
2683
|
adata=adata,
|
2912
|
-
categoricals=
|
2913
|
-
defaults=
|
2914
|
-
verbosity=verbosity,
|
2684
|
+
categoricals=categoricals,
|
2685
|
+
defaults=categoricals_defaults,
|
2915
2686
|
organism=organism,
|
2916
|
-
extra_sources=self.
|
2687
|
+
extra_sources=self._configure_sources(adata),
|
2917
2688
|
schema_version=cxg_schema_version,
|
2689
|
+
verbosity=verbosity,
|
2918
2690
|
)
|
2919
2691
|
|
2920
|
-
def
|
2692
|
+
def _configure_categoricals(self, adata: ad.AnnData):
|
2921
2693
|
"""Set up default configuration values."""
|
2922
2694
|
import bionty as bt
|
2923
2695
|
import wetlab as wl
|
2924
2696
|
|
2925
|
-
|
2926
|
-
CellxGeneAnnDataCatManager._get_categoricals_defaults()
|
2927
|
-
| {
|
2928
|
-
"cell_line": "unknown",
|
2929
|
-
"pert_target": "unknown",
|
2930
|
-
}
|
2931
|
-
)
|
2932
|
-
|
2933
|
-
self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
|
2697
|
+
categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
|
2934
2698
|
k: v
|
2935
2699
|
for k, v in {
|
2936
2700
|
"cell_line": bt.CellLine.name,
|
@@ -2942,22 +2706,40 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
|
2942
2706
|
}.items()
|
2943
2707
|
if k in adata.obs.columns
|
2944
2708
|
}
|
2945
|
-
# if "donor_id" in
|
2946
|
-
#
|
2709
|
+
# if "donor_id" in categoricals:
|
2710
|
+
# categoricals["donor_id"] = Donor.name
|
2947
2711
|
|
2948
|
-
|
2712
|
+
categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
|
2713
|
+
"cell_line": "unknown",
|
2714
|
+
"pert_target": "unknown",
|
2715
|
+
}
|
2716
|
+
|
2717
|
+
return categoricals, categoricals_defaults
|
2718
|
+
|
2719
|
+
def _configure_sources(self, adata: ad.AnnData):
|
2949
2720
|
"""Set up data sources."""
|
2950
|
-
|
2951
|
-
|
2952
|
-
|
2953
|
-
|
2954
|
-
|
2721
|
+
import bionty as bt
|
2722
|
+
import wetlab as wl
|
2723
|
+
|
2724
|
+
sources = {}
|
2725
|
+
if "cell_line" in adata.obs.columns:
|
2726
|
+
sources["cell_line"] = bt.Source.filter(
|
2727
|
+
entity="bionty.CellLine", name="depmap"
|
2728
|
+
).first()
|
2955
2729
|
if "pert_compound" in adata.obs.columns:
|
2956
|
-
|
2730
|
+
with logger.mute():
|
2731
|
+
chebi_source = bt.Source.filter(
|
2732
|
+
entity="wetlab.Compound", name="chebi"
|
2733
|
+
).first()
|
2734
|
+
if not chebi_source:
|
2735
|
+
wl.Compound.add_source(
|
2736
|
+
bt.Source.filter(entity="Drug", name="chebi").first()
|
2737
|
+
)
|
2957
2738
|
|
2958
|
-
|
2739
|
+
sources["pert_compound"] = bt.Source.filter(
|
2959
2740
|
entity="wetlab.Compound", name="chebi"
|
2960
2741
|
).first()
|
2742
|
+
return sources
|
2961
2743
|
|
2962
2744
|
def _validate_initial_data(self, adata: ad.AnnData):
|
2963
2745
|
"""Validate the initial data structure."""
|
@@ -3005,20 +2787,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
|
3005
2787
|
adata.obs[col_name].cat.remove_unused_categories()
|
3006
2788
|
logger.important(f"mapped 'pert_name' to '{col_name}'")
|
3007
2789
|
|
3008
|
-
def _setup_compound_source(self):
|
3009
|
-
"""Set up the compound source with muted logging."""
|
3010
|
-
import bionty as bt
|
3011
|
-
import wetlab as wl
|
3012
|
-
|
3013
|
-
with logger.mute():
|
3014
|
-
chebi_source = bt.Source.filter(
|
3015
|
-
entity="wetlab.Compound", name="chebi"
|
3016
|
-
).first()
|
3017
|
-
if not chebi_source:
|
3018
|
-
wl.Compound.add_source(
|
3019
|
-
bt.Source.filter(entity="Drug", name="chebi").first()
|
3020
|
-
)
|
3021
|
-
|
3022
2790
|
def validate(self) -> bool: # type: ignore
|
3023
2791
|
"""Validate the AnnData object."""
|
3024
2792
|
validated = super().validate()
|
@@ -3136,8 +2904,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
|
3136
2904
|
|
3137
2905
|
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
3138
2906
|
"""Make sure the source and organism are saved in the same database as the registry."""
|
3139
|
-
from lamindb.core._settings import settings
|
3140
|
-
|
3141
2907
|
db = registry.filter().db
|
3142
2908
|
source = kwargs.get("source")
|
3143
2909
|
organism = kwargs.get("organism")
|
@@ -3162,44 +2928,15 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
|
3162
2928
|
return filter_kwargs
|
3163
2929
|
|
3164
2930
|
|
3165
|
-
def
|
3166
|
-
values: Iterable[str],
|
3167
|
-
field: FieldAttr,
|
3168
|
-
registry: type[Record],
|
3169
|
-
exclude: str | list | None = None,
|
3170
|
-
**kwargs,
|
3171
|
-
):
|
3172
|
-
"""Inspect values using a registry."""
|
3173
|
-
# inspect exclude values in the default instance
|
3174
|
-
values = list(values)
|
3175
|
-
include_validated = []
|
3176
|
-
if exclude is not None:
|
3177
|
-
exclude = [exclude] if isinstance(exclude, str) else exclude
|
3178
|
-
exclude = [i for i in exclude if i in values]
|
3179
|
-
if len(exclude) > 0:
|
3180
|
-
# exclude values are validated without source and organism
|
3181
|
-
inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
|
3182
|
-
# if exclude values are validated, remove them from the values
|
3183
|
-
values = [i for i in values if i not in inspect_result_exclude.validated]
|
3184
|
-
include_validated = inspect_result_exclude.validated
|
3185
|
-
|
3186
|
-
inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
|
3187
|
-
inspect_result._validated += include_validated
|
3188
|
-
inspect_result._non_validated = [
|
3189
|
-
i for i in inspect_result.non_validated if i not in include_validated
|
3190
|
-
]
|
3191
|
-
|
3192
|
-
return inspect_result
|
3193
|
-
|
3194
|
-
|
3195
|
-
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
|
2931
|
+
def configure_organism(registry: Record, organism: str | None = None) -> dict[str, str]:
|
3196
2932
|
"""Check if a registry needs an organism and return the organism name."""
|
3197
|
-
|
2933
|
+
from ..models._from_values import _is_organism_required
|
2934
|
+
|
2935
|
+
if _is_organism_required(registry):
|
3198
2936
|
import bionty as bt
|
3199
2937
|
|
3200
|
-
if organism is None
|
3201
|
-
return {}
|
3202
|
-
return {"organism": organism or bt.settings.organism.name}
|
2938
|
+
if organism is not None or bt.settings.organism is not None:
|
2939
|
+
return {"organism": organism or bt.settings.organism.name}
|
3203
2940
|
return {}
|
3204
2941
|
|
3205
2942
|
|
@@ -3209,7 +2946,6 @@ def validate_categories(
|
|
3209
2946
|
key: str,
|
3210
2947
|
organism: str | None = None,
|
3211
2948
|
source: Record | None = None,
|
3212
|
-
exclude: str | list | None = None,
|
3213
2949
|
hint_print: str | None = None,
|
3214
2950
|
curator: CatManager | None = None,
|
3215
2951
|
) -> tuple[bool, list[str]]:
|
@@ -3221,13 +2957,9 @@ def validate_categories(
|
|
3221
2957
|
key: The key referencing the slot in the DataFrame.
|
3222
2958
|
organism: The organism name.
|
3223
2959
|
source: The source record.
|
3224
|
-
exclude: Exclude specific values from validation.
|
3225
2960
|
standardize: Whether to standardize the values.
|
3226
2961
|
hint_print: The hint to print that suggests fixing non-validated values.
|
3227
2962
|
"""
|
3228
|
-
from lamindb.core._settings import settings
|
3229
|
-
from lamindb.models._from_values import _format_values
|
3230
|
-
|
3231
2963
|
model_field = f"{field.field.model.__name__}.{field.field.name}"
|
3232
2964
|
|
3233
2965
|
def _log_mapping_info():
|
@@ -3237,36 +2969,26 @@ def validate_categories(
|
|
3237
2969
|
|
3238
2970
|
registry = field.field.model
|
3239
2971
|
|
3240
|
-
# {"organism": organism_name
|
3241
|
-
kwargs =
|
2972
|
+
# {"organism": organism_name}
|
2973
|
+
kwargs = configure_organism(registry, organism)
|
3242
2974
|
kwargs.update({"source": source} if source else {})
|
3243
2975
|
kwargs_current = get_current_filter_kwargs(registry, kwargs)
|
3244
2976
|
|
3245
2977
|
# inspect values from the default instance
|
3246
|
-
inspect_result =
|
3247
|
-
values=values,
|
3248
|
-
field=field,
|
3249
|
-
registry=registry,
|
3250
|
-
exclude=exclude,
|
3251
|
-
**kwargs_current,
|
3252
|
-
)
|
2978
|
+
inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
|
3253
2979
|
non_validated = inspect_result.non_validated
|
3254
2980
|
syn_mapper = inspect_result.synonyms_mapper
|
3255
2981
|
|
3256
2982
|
# inspect the non-validated values from public (bionty only)
|
3257
2983
|
values_validated = []
|
3258
2984
|
if hasattr(registry, "public"):
|
3259
|
-
|
3260
|
-
|
3261
|
-
|
3262
|
-
|
3263
|
-
|
3264
|
-
|
3265
|
-
|
3266
|
-
)
|
3267
|
-
values_validated += [getattr(r, field.field.name) for r in public_records]
|
3268
|
-
finally:
|
3269
|
-
settings.verbosity = verbosity
|
2985
|
+
public_records = registry.from_values(
|
2986
|
+
non_validated,
|
2987
|
+
field=field,
|
2988
|
+
mute=True,
|
2989
|
+
**kwargs_current,
|
2990
|
+
)
|
2991
|
+
values_validated += [getattr(r, field.field.name) for r in public_records]
|
3270
2992
|
|
3271
2993
|
# logging messages
|
3272
2994
|
non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
|
@@ -3330,7 +3052,6 @@ def validate_categories_in_df(
|
|
3330
3052
|
df: pd.DataFrame,
|
3331
3053
|
fields: dict[str, FieldAttr],
|
3332
3054
|
sources: dict[str, Record] = None,
|
3333
|
-
exclude: dict | None = None,
|
3334
3055
|
curator: CatManager | None = None,
|
3335
3056
|
**kwargs,
|
3336
3057
|
) -> tuple[bool, dict]:
|
@@ -3348,7 +3069,6 @@ def validate_categories_in_df(
|
|
3348
3069
|
field=field,
|
3349
3070
|
key=key,
|
3350
3071
|
source=sources.get(key),
|
3351
|
-
exclude=exclude.get(key) if exclude else None,
|
3352
3072
|
curator=curator,
|
3353
3073
|
**kwargs,
|
3354
3074
|
)
|
@@ -3359,9 +3079,10 @@ def validate_categories_in_df(
|
|
3359
3079
|
|
3360
3080
|
|
3361
3081
|
def save_artifact(
|
3362
|
-
data: pd.DataFrame |
|
3082
|
+
data: pd.DataFrame | ScverseDataStructures,
|
3083
|
+
*,
|
3363
3084
|
fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
|
3364
|
-
|
3085
|
+
index_field: FieldAttr | dict[str, FieldAttr] | None = None,
|
3365
3086
|
description: str | None = None,
|
3366
3087
|
organism: str | None = None,
|
3367
3088
|
key: str | None = None,
|
@@ -3369,73 +3090,64 @@ def save_artifact(
|
|
3369
3090
|
revises: Artifact | None = None,
|
3370
3091
|
run: Run | None = None,
|
3371
3092
|
schema: Schema | None = None,
|
3093
|
+
**kwargs,
|
3372
3094
|
) -> Artifact:
|
3373
3095
|
"""Save all metadata with an Artifact.
|
3374
3096
|
|
3375
3097
|
Args:
|
3376
|
-
data: The
|
3098
|
+
data: The object to save.
|
3377
3099
|
fields: A dictionary mapping obs_column to registry_field.
|
3378
|
-
|
3100
|
+
index_field: The registry field to validate variables index against.
|
3379
3101
|
description: A description of the artifact.
|
3380
3102
|
organism: The organism name.
|
3381
|
-
type: The artifact type.
|
3382
3103
|
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
3383
3104
|
artifact: A already registered artifact. Passing this will not save a new artifact from data.
|
3384
3105
|
revises: Previous version of the artifact. Triggers a revision.
|
3385
3106
|
run: The run that creates the artifact.
|
3107
|
+
schema: The Schema to associate with the Artifact.
|
3386
3108
|
|
3387
3109
|
Returns:
|
3388
3110
|
The saved Artifact.
|
3389
3111
|
"""
|
3390
|
-
from ..models.artifact import add_labels
|
3112
|
+
from ..models.artifact import add_labels
|
3391
3113
|
|
3392
3114
|
if artifact is None:
|
3393
|
-
if
|
3394
|
-
artifact = Artifact.
|
3115
|
+
if isinstance(data, pd.DataFrame):
|
3116
|
+
artifact = Artifact.from_df(
|
3395
3117
|
data, description=description, key=key, revises=revises, run=run
|
3396
3118
|
)
|
3397
|
-
elif isinstance(data,
|
3398
|
-
artifact = Artifact.
|
3119
|
+
elif isinstance(data, AnnData):
|
3120
|
+
artifact = Artifact.from_anndata(
|
3399
3121
|
data, description=description, key=key, revises=revises, run=run
|
3400
3122
|
)
|
3401
3123
|
elif data_is_mudata(data):
|
3402
3124
|
artifact = Artifact.from_mudata(
|
3403
|
-
data,
|
3404
|
-
|
3405
|
-
|
3406
|
-
|
3407
|
-
run=run
|
3125
|
+
data, description=description, key=key, revises=revises, run=run
|
3126
|
+
)
|
3127
|
+
elif data_is_spatialdata(data):
|
3128
|
+
artifact = Artifact.from_spatialdata(
|
3129
|
+
data, description=description, key=key, revises=revises, run=run
|
3130
|
+
)
|
3131
|
+
else:
|
3132
|
+
raise InvalidArgument( # pragma: no cover
|
3133
|
+
"data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
|
3408
3134
|
)
|
3409
|
-
artifact.schema = schema
|
3410
3135
|
artifact.save()
|
3411
3136
|
|
3412
|
-
if organism is not None and
|
3413
|
-
feature_kwargs =
|
3137
|
+
if organism is not None and index_field is not None:
|
3138
|
+
feature_kwargs = configure_organism(
|
3414
3139
|
(
|
3415
|
-
list(
|
3416
|
-
if isinstance(
|
3417
|
-
else
|
3140
|
+
list(index_field.values())[0].field.model
|
3141
|
+
if isinstance(index_field, dict)
|
3142
|
+
else index_field.field.model
|
3418
3143
|
),
|
3419
3144
|
organism,
|
3420
3145
|
)
|
3421
3146
|
else:
|
3422
3147
|
feature_kwargs = {}
|
3423
3148
|
|
3424
|
-
if artifact.otype == "DataFrame":
|
3425
|
-
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
|
3426
|
-
elif artifact.otype == "AnnData":
|
3427
|
-
artifact.features._add_set_from_anndata( # type: ignore
|
3428
|
-
var_field=columns_field, **feature_kwargs
|
3429
|
-
)
|
3430
|
-
elif artifact.otype == "MuData":
|
3431
|
-
artifact.features._add_set_from_mudata( # type: ignore
|
3432
|
-
var_fields=columns_field, **feature_kwargs
|
3433
|
-
)
|
3434
|
-
else:
|
3435
|
-
raise NotImplementedError
|
3436
|
-
|
3437
3149
|
def _add_labels(
|
3438
|
-
data,
|
3150
|
+
data: pd.DataFrame | ScverseDataStructures,
|
3439
3151
|
artifact: Artifact,
|
3440
3152
|
fields: dict[str, FieldAttr],
|
3441
3153
|
feature_ref_is_name: bool | None = None,
|
@@ -3444,7 +3156,7 @@ def save_artifact(
|
|
3444
3156
|
for key, field in fields.items():
|
3445
3157
|
feature = features.get(key)
|
3446
3158
|
registry = field.field.model
|
3447
|
-
filter_kwargs =
|
3159
|
+
filter_kwargs = configure_organism(registry, organism)
|
3448
3160
|
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
3449
3161
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
3450
3162
|
# multi-value columns are separated by "|"
|
@@ -3471,35 +3183,81 @@ def save_artifact(
|
|
3471
3183
|
from_curator=True,
|
3472
3184
|
)
|
3473
3185
|
|
3474
|
-
|
3475
|
-
|
3476
|
-
|
3477
|
-
|
3478
|
-
|
3479
|
-
|
3480
|
-
|
3481
|
-
|
3482
|
-
|
3483
|
-
|
3484
|
-
|
3485
|
-
|
3486
|
-
|
3487
|
-
|
3488
|
-
|
3489
|
-
|
3490
|
-
|
3491
|
-
|
3492
|
-
|
3493
|
-
|
3494
|
-
|
3495
|
-
|
3496
|
-
|
3497
|
-
|
3498
|
-
|
3499
|
-
|
3500
|
-
|
3501
|
-
|
3502
|
-
|
3186
|
+
match artifact.otype:
|
3187
|
+
case "DataFrame":
|
3188
|
+
artifact.features._add_set_from_df(field=index_field, **feature_kwargs) # type: ignore
|
3189
|
+
_add_labels(
|
3190
|
+
data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
|
3191
|
+
)
|
3192
|
+
case "AnnData":
|
3193
|
+
artifact.features._add_set_from_anndata( # type: ignore
|
3194
|
+
var_field=index_field, **feature_kwargs
|
3195
|
+
)
|
3196
|
+
_add_labels(
|
3197
|
+
data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
|
3198
|
+
)
|
3199
|
+
case "MuData":
|
3200
|
+
artifact.features._add_set_from_mudata( # type: ignore
|
3201
|
+
var_fields=index_field, **feature_kwargs
|
3202
|
+
)
|
3203
|
+
for modality, modality_fields in fields.items():
|
3204
|
+
column_field_modality = index_field.get(modality)
|
3205
|
+
if modality == "obs":
|
3206
|
+
_add_labels(
|
3207
|
+
data,
|
3208
|
+
artifact,
|
3209
|
+
modality_fields,
|
3210
|
+
feature_ref_is_name=(
|
3211
|
+
None
|
3212
|
+
if column_field_modality is None
|
3213
|
+
else _ref_is_name(column_field_modality)
|
3214
|
+
),
|
3215
|
+
)
|
3216
|
+
else:
|
3217
|
+
_add_labels(
|
3218
|
+
data[modality],
|
3219
|
+
artifact,
|
3220
|
+
modality_fields,
|
3221
|
+
feature_ref_is_name=(
|
3222
|
+
None
|
3223
|
+
if column_field_modality is None
|
3224
|
+
else _ref_is_name(column_field_modality)
|
3225
|
+
),
|
3226
|
+
)
|
3227
|
+
case "SpatialData":
|
3228
|
+
artifact.features._add_set_from_spatialdata( # type: ignore
|
3229
|
+
sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
|
3230
|
+
var_fields=index_field,
|
3231
|
+
**feature_kwargs,
|
3232
|
+
)
|
3233
|
+
sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
|
3234
|
+
for accessor, accessor_fields in fields.items():
|
3235
|
+
column_field = index_field.get(accessor)
|
3236
|
+
if accessor == sample_metadata_key:
|
3237
|
+
_add_labels(
|
3238
|
+
data.get_attrs(
|
3239
|
+
key=sample_metadata_key, return_as="df", flatten=True
|
3240
|
+
),
|
3241
|
+
artifact,
|
3242
|
+
accessor_fields,
|
3243
|
+
feature_ref_is_name=(
|
3244
|
+
None if column_field is None else _ref_is_name(column_field)
|
3245
|
+
),
|
3246
|
+
)
|
3247
|
+
else:
|
3248
|
+
_add_labels(
|
3249
|
+
data.tables[accessor],
|
3250
|
+
artifact,
|
3251
|
+
accessor_fields,
|
3252
|
+
feature_ref_is_name=(
|
3253
|
+
None if column_field is None else _ref_is_name(column_field)
|
3254
|
+
),
|
3255
|
+
)
|
3256
|
+
case _:
|
3257
|
+
raise NotImplementedError # pragma: no cover
|
3258
|
+
|
3259
|
+
artifact.schema = schema
|
3260
|
+
artifact.save()
|
3503
3261
|
|
3504
3262
|
slug = ln_setup.settings.instance.slug
|
3505
3263
|
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
@@ -3529,8 +3287,7 @@ def update_registry(
|
|
3529
3287
|
organism: str | None = None,
|
3530
3288
|
dtype: str | None = None,
|
3531
3289
|
source: Record | None = None,
|
3532
|
-
|
3533
|
-
**kwargs,
|
3290
|
+
**create_kwargs,
|
3534
3291
|
) -> None:
|
3535
3292
|
"""Save features or labels records in the default instance.
|
3536
3293
|
|
@@ -3543,14 +3300,12 @@ def update_registry(
|
|
3543
3300
|
organism: The organism name.
|
3544
3301
|
dtype: The type of the feature.
|
3545
3302
|
source: The source record.
|
3546
|
-
|
3547
|
-
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
3303
|
+
**create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
3548
3304
|
"""
|
3549
|
-
from lamindb.core._settings import settings
|
3550
3305
|
from lamindb.models.save import save as ln_save
|
3551
3306
|
|
3552
3307
|
registry = field.field.model
|
3553
|
-
filter_kwargs =
|
3308
|
+
filter_kwargs = configure_organism(registry, organism)
|
3554
3309
|
filter_kwargs.update({"source": source} if source else {})
|
3555
3310
|
values = [i for i in values if isinstance(i, str) and i]
|
3556
3311
|
if not values:
|
@@ -3608,14 +3363,16 @@ def update_registry(
|
|
3608
3363
|
registry(
|
3609
3364
|
**init_kwargs,
|
3610
3365
|
**{k: v for k, v in filter_kwargs.items() if k != "source"},
|
3611
|
-
**{
|
3366
|
+
**{
|
3367
|
+
k: v for k, v in create_kwargs.items() if k != "sources"
|
3368
|
+
},
|
3612
3369
|
)
|
3613
3370
|
)
|
3614
3371
|
ln_save(non_validated_records)
|
3615
3372
|
|
3616
3373
|
# save parent labels for ulabels, for example a parent label "project" for label "project001"
|
3617
3374
|
if registry == ULabel and field.field.name == "name":
|
3618
|
-
|
3375
|
+
save_ulabels_type(values, field=field, key=key)
|
3619
3376
|
|
3620
3377
|
finally:
|
3621
3378
|
settings.verbosity = verbosity
|
@@ -3653,16 +3410,18 @@ def log_saved_labels(
|
|
3653
3410
|
)
|
3654
3411
|
|
3655
3412
|
|
3656
|
-
def
|
3657
|
-
"""Save
|
3413
|
+
def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
|
3414
|
+
"""Save the ULabel type of the given labels."""
|
3658
3415
|
registry = field.field.model
|
3659
3416
|
assert registry == ULabel # noqa: S101
|
3660
|
-
all_records = registry.
|
3661
|
-
|
3662
|
-
|
3663
|
-
|
3664
|
-
|
3665
|
-
|
3417
|
+
all_records = registry.filter(**{field.field.name: list(values)}).all()
|
3418
|
+
# so `tissue_type` becomes `TissueType`
|
3419
|
+
type_name = "".join([i.capitalize() for i in key.lower().split("_")])
|
3420
|
+
ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
|
3421
|
+
if ulabel_type is None:
|
3422
|
+
ulabel_type = registry(name=type_name, is_type=True).save()
|
3423
|
+
logger.important(f"Created a ULabel type: {ulabel_type}")
|
3424
|
+
all_records.update(type=ulabel_type)
|
3666
3425
|
|
3667
3426
|
|
3668
3427
|
def _save_organism(name: str):
|
@@ -3761,7 +3520,6 @@ def from_tiledbsoma(
|
|
3761
3520
|
obs_columns: FieldAttr = Feature.name,
|
3762
3521
|
organism: str | None = None,
|
3763
3522
|
sources: dict[str, Record] | None = None,
|
3764
|
-
exclude: dict[str, str | list[str]] | None = None,
|
3765
3523
|
) -> TiledbsomaCatManager:
|
3766
3524
|
return TiledbsomaCatManager(
|
3767
3525
|
experiment_uri=experiment_uri,
|
@@ -3770,7 +3528,6 @@ def from_tiledbsoma(
|
|
3770
3528
|
obs_columns=obs_columns,
|
3771
3529
|
organism=organism,
|
3772
3530
|
sources=sources,
|
3773
|
-
exclude=exclude,
|
3774
3531
|
)
|
3775
3532
|
|
3776
3533
|
|
@@ -3782,7 +3539,6 @@ def from_spatialdata(
|
|
3782
3539
|
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
3783
3540
|
organism: str | None = None,
|
3784
3541
|
sources: dict[str, dict[str, Record]] | None = None,
|
3785
|
-
exclude: dict[str, dict] | None = None,
|
3786
3542
|
verbosity: str = "hint",
|
3787
3543
|
*,
|
3788
3544
|
sample_metadata_key: str = "sample",
|
@@ -3799,7 +3555,6 @@ def from_spatialdata(
|
|
3799
3555
|
verbosity=verbosity,
|
3800
3556
|
organism=organism,
|
3801
3557
|
sources=sources,
|
3802
|
-
exclude=exclude,
|
3803
3558
|
sample_metadata_key=sample_metadata_key,
|
3804
3559
|
)
|
3805
3560
|
|