lamindb 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/core/_context.py +6 -0
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/storage/objects.py +1 -2
- lamindb/curators/__init__.py +1269 -1513
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +65 -14
- lamindb/models/_from_values.py +113 -78
- lamindb/models/artifact.py +138 -95
- lamindb/models/can_curate.py +185 -216
- lamindb/models/feature.py +32 -2
- lamindb/models/project.py +69 -7
- lamindb/models/record.py +43 -25
- lamindb/models/run.py +18 -1
- lamindb/models/schema.py +0 -8
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/METADATA +6 -5
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/RECORD +22 -22
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/LICENSE +0 -0
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/WHEEL +0 -0
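The headline change in this release is the rewrite of `lamindb/curators/__init__.py`: a new `SlotsCurator` base class, new `MuDataCurator` and `SpatialDataCurator` classes, the `CurateLookup` → `CatLookup` rename, and removal of the `exclude` parameter from the `CatManager` classes. As a quick orientation before the raw diff, here is a minimal usage sketch assembled from the docstring examples that appear in the diff itself; the `anndata_schema` object is assumed to be an `ln.Schema` with `otype="AnnData"` and "obs"/"var" components, as defined in the `AnnDataCurator` docstring that is only partially visible in this excerpt:

    import lamindb as ln

    # `anndata_schema` is assumed to be defined as in the AnnDataCurator docstring
    # (an ln.Schema with otype="AnnData" and "obs"/"var" component schemas)
    adata = ln.core.datasets.small_dataset1(otype="AnnData")
    curator = ln.curators.AnnDataCurator(adata, anndata_schema)
    artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
    assert artifact.schema == anndata_schema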
lamindb/curators/__init__.py
CHANGED
@@ -1,25 +1,27 @@
 """Curators.

-.. versionadded:: 1.1.0
-
 .. autosummary::
    :toctree: .

-   Curator
    DataFrameCurator
    AnnDataCurator
+   MuDataCurator
+   SpatialDataCurator

-
+Helper classes.

 .. autosummary::
    :toctree: .

+   Curator
+   SlotsCurator
    CatManager
+   CatLookup
    DataFrameCatManager
    AnnDataCatManager
    MuDataCatManager
+   SpatialDataCatManager
    TiledbsomaCatManager
-   CurateLookup

 """

@@ -27,7 +29,6 @@ from __future__ import annotations

 import copy
 import re
-from importlib import resources
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Literal

@@ -37,45 +38,44 @@ import pandas as pd
 import pandera
 import pyarrow as pa
 from lamin_utils import colors, logger
-from lamindb_setup.core import deprecated
+from lamindb_setup.core import deprecated
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.upath import UPath

-from lamindb.core.storage._backed_access import backed_access
-
-from ._cellxgene_schemas import _read_schema_versions
-
 if TYPE_CHECKING:
-    from anndata import AnnData
     from lamindb_setup.core.types import UPathStr
+    from mudata import MuData
+    from spatialdata import SpatialData

-    from lamindb.
+    from lamindb.core.types import ScverseDataStructures
     from lamindb.models import Record
 from lamindb.base.types import FieldAttr  # noqa
 from lamindb.core._settings import settings
 from lamindb.models import (
     Artifact,
-    Collection,
     Feature,
     Record,
     Run,
     Schema,
     ULabel,
 )
-from lamindb.models.
-
+from lamindb.models.artifact import (
+    add_labels,
+    data_is_anndata,
+    data_is_mudata,
+    data_is_spatialdata,
+)
 from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
 from lamindb.models._from_values import _format_values

 from ..errors import InvalidArgument, ValidationError
+from anndata import AnnData

 if TYPE_CHECKING:
     from collections.abc import Iterable, MutableMapping
     from typing import Any

     from lamindb_setup.core.types import UPathStr
-    from mudata import MuData
-    from spatialdata import SpatialData

     from lamindb.models.query_set import RecordList

@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
     return ansi_pattern.sub("", text)


-class CurateLookup:
+class CatLookup:
     """Lookup categories from the reference instance.

     Args:
@@ -94,10 +94,10 @@ class CurateLookup:
         slots: A dictionary of slot fields to lookup.
         public: Whether to lookup from the public instance. Defaults to False.

-    Example
-
-
-
+    Example::
+
+        curator = ln.curators.DataFrameCurator(...)
+        curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell

     """

@@ -163,7 +163,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
 """


-VALIDATE_DOCSTRING = """Validate dataset.
+VALIDATE_DOCSTRING = """Validate dataset against Schema.

 Raises:
     lamindb.errors.ValidationError: If validation fails.
@@ -183,15 +183,17 @@ Returns:


 class Curator:
-    """
+    """Curator base class.

     A `Curator` object makes it easy to validate, standardize & annotate datasets.

-    .. versionadded:: 1.1.0
-
     See:
         - :class:`~lamindb.curators.DataFrameCurator`
         - :class:`~lamindb.curators.AnnDataCurator`
+        - :class:`~lamindb.curators.MuDataCurator`
+        - :class:`~lamindb.curators.SpatialDataCurator`
+
+    .. versionadded:: 1.1.0
     """

     def __init__(self, dataset: Any, schema: Schema | None = None):
@@ -199,7 +201,12 @@ class Curator:
         self._dataset: Any = dataset  # pass the dataset as a UPathStr or data object
         if isinstance(self._dataset, Artifact):
             self._artifact = self._dataset
-            if self._artifact.otype in {
+            if self._artifact.otype in {
+                "DataFrame",
+                "AnnData",
+                "MuData",
+                "SpatialData",
+            }:
                 self._dataset = self._dataset.load()
         self._schema: Schema | None = schema
         self._is_validated: bool = False
@@ -225,9 +232,72 @@ class Curator:
         pass


+class SlotsCurator(Curator):
+    """Curator for a dataset with slots.
+
+    Args:
+        dataset: The dataset to validate & annotate.
+        schema: A `Schema` object that defines the validation constraints.
+
+    .. versionadded:: 1.3.0
+    """
+
+    def __init__(
+        self,
+        dataset: Any,
+        schema: Schema,
+    ) -> None:
+        super().__init__(dataset=dataset, schema=schema)
+        self._slots: dict[str, DataFrameCurator] = {}
+
+        # used in MuDataCurator and SpatialDataCurator
+        # in form of {table/modality_key: var_field}
+        self._var_fields: dict[str, FieldAttr] = {}
+        # in form of {table/modality_key: categoricals}
+        self._categoricals: dict[str, dict[str, FieldAttr]] = {}
+
+    @property
+    @doc_args(SLOTS_DOCSTRING)
+    def slots(self) -> dict[str, DataFrameCurator]:
+        """{}"""  # noqa: D415
+        return self._slots
+
+    @doc_args(VALIDATE_DOCSTRING)
+    def validate(self) -> None:
+        """{}"""  # noqa: D415
+        for _, curator in self._slots.items():
+            curator.validate()
+
+    @doc_args(SAVE_ARTIFACT_DOCSTRING)
+    def save_artifact(
+        self,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        revises: Artifact | None = None,
+        run: Run | None = None,
+    ) -> Artifact:
+        """{}"""  # noqa: D415
+        if not self._is_validated:
+            self.validate()
+
+        # default implementation for MuDataCurator and SpatialDataCurator
+        return save_artifact(  # type: ignore
+            self._dataset,
+            key=key,
+            description=description,
+            fields=self._categoricals,
+            index_field=self._var_fields,
+            artifact=self._artifact,
+            revises=revises,
+            run=run,
+            schema=self._schema,
+        )
+
+
 class DataFrameCurator(Curator):
     # the example in the docstring is tested in test_curators_quickstart_example
-    """Curator for
+    """Curator for `DataFrame`.

     See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.

@@ -282,7 +352,9 @@ class DataFrameCurator(Curator):
                 feature.dtype if not feature.dtype.startswith("cat") else "category"
             )
             pandera_columns[feature.name] = pandera.Column(
-                pandera_dtype,
+                pandera_dtype,
+                nullable=feature.nullable,
+                coerce=feature.coerce_dtype,
             )
             if feature.dtype.startswith("cat"):
                 categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
@@ -378,7 +450,7 @@ class DataFrameCurator(Curator):
         description: str | None = None,
         revises: Artifact | None = None,
         run: Run | None = None,
-    ):
+    ) -> Artifact:
         """{}"""  # noqa: D415
         if not self._is_validated:
             self.validate()  # raises ValidationError if doesn't validate
@@ -387,7 +459,7 @@ class DataFrameCurator(Curator):
             self._dataset,
             description=description,
             fields=self._cat_manager.categoricals,
-
+            index_field=result["field"],
             key=key,
             artifact=self._artifact,
             revises=revises,
@@ -396,9 +468,9 @@ class DataFrameCurator(Curator):
         )


-class AnnDataCurator(Curator):
+class AnnDataCurator(SlotsCurator):
     # the example in the docstring is tested in test_curators_quickstart_example
-    """Curator for
+    """Curator for `AnnData`.

     See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.

@@ -446,7 +518,7 @@ class AnnDataCurator(Curator):
         ).save()

         # curate an AnnData
-        adata = datasets.small_dataset1(otype="AnnData")
+        adata = ln.core.datasets.small_dataset1(otype="AnnData")
         curator = ln.curators.AnnDataCurator(adata, anndata_schema)
         artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
         assert artifact.schema == anndata_schema
@@ -466,9 +538,9 @@ class AnnDataCurator(Curator):
         self._slots = {
             slot: DataFrameCurator(
                 (
-                    self._dataset
+                    getattr(self._dataset, slot).T
                     if slot == "var"
-                    else self._dataset
+                    else getattr(self._dataset, slot)
                 ),
                 slot_schema,
             )
@@ -476,18 +548,6 @@ class AnnDataCurator(Curator):
             if slot in {"obs", "var"}
         }

-    @property
-    @doc_args(SLOTS_DOCSTRING)
-    def slots(self) -> dict[str, DataFrameCurator]:
-        """{}"""  # noqa: D415
-        return self._slots
-
-    @doc_args(VALIDATE_DOCSTRING)
-    def validate(self) -> None:
-        """{}"""  # noqa: D415
-        for _, curator in self._slots.items():
-            curator.validate()
-
     @doc_args(SAVE_ARTIFACT_DOCSTRING)
     def save_artifact(
         self,
@@ -496,7 +556,7 @@ class AnnDataCurator(Curator):
         description: str | None = None,
         revises: Artifact | None = None,
         run: Run | None = None,
-    ):
+    ) -> Artifact:
         """{}"""  # noqa: D415
         if not self._is_validated:
             self.validate()
@@ -504,7 +564,7 @@ class AnnDataCurator(Curator):
             self._dataset,
             description=description,
             fields=self.slots["obs"]._cat_manager.categoricals,
-
+            index_field=(
                 parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
                     "field"
                 ]
@@ -519,34 +579,286 @@ class AnnDataCurator(Curator):
             )


-
-
+def _assign_var_fields_categoricals_multimodal(
+    modality: str | None,
+    slot_type: str,
+    slot: str,
+    slot_schema: Schema,
+    var_fields: dict[str, FieldAttr],
+    categoricals: dict[str, dict[str, FieldAttr]],
+    slots: dict[str, DataFrameCurator],
+) -> None:
+    """Assigns var_fields and categoricals for multimodal data curators."""
+    if modality is not None:
+        # Makes sure that all tables are present
+        var_fields[modality] = None
+        categoricals[modality] = {}
+
+    if slot_type == "var":
+        var_field = parse_dtype_single_cat(slot_schema.itype, is_itype=True)["field"]
+        if modality is None:
+            # This should rarely/never be used since tables should have different var fields
+            var_fields[slot] = var_field  # pragma: no cover
+        else:
+            # Note that this is NOT nested since the nested key is always "var"
+            var_fields[modality] = var_field
+    else:
+        obs_fields = slots[slot]._cat_manager.categoricals
+        if modality is None:
+            categoricals[slot] = obs_fields
+        else:
+            # Note that this is NOT nested since the nested key is always "obs"
+            categoricals[modality] = obs_fields
+

-
+class MuDataCurator(SlotsCurator):
+    # the example in the docstring is tested in test_curators_quickstart_example
+    """Curator for `MuData`.
+
+    See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.

-
+    .. versionadded:: 1.3.0

-
-
-
-    >>> columns=Feature.name, # map column names
-    >>> categoricals={"perturbation": ULabel.name}, # map categories
-    >>> )
-    >>> cat_manager.validate() # validate the dataframe
-    >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
-    >>> artifact.describe() # see annotations
+    Args:
+        dataset: The MuData-like object to validate & annotate.
+        schema: A `Schema` object that defines the validation constraints.

-
+    Example::

-
+        import lamindb as ln
+        import bionty as bt
+
+        # define the global obs schema
+        obs_schema = ln.Schema(
+            name="mudata_papalexi21_subset_obs_schema",
+            features=[
+                ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
+                ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
+            ],
+        ).save()
+
+        # define the ['rna'].obs schema
+        obs_schema_rna = ln.Schema(
+            name="mudata_papalexi21_subset_rna_obs_schema",
+            features=[
+                ln.Feature(name="nCount_RNA", dtype=int).save(),
+                ln.Feature(name="nFeature_RNA", dtype=int).save(),
+                ln.Feature(name="percent.mito", dtype=float).save(),
+            ],
+            coerce_dtype=True,
+        ).save()
+
+        # define the ['hto'].obs schema
+        obs_schema_hto = ln.Schema(
+            name="mudata_papalexi21_subset_hto_obs_schema",
+            features=[
+                ln.Feature(name="nCount_HTO", dtype=int).save(),
+                ln.Feature(name="nFeature_HTO", dtype=int).save(),
+                ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
+            ],
+            coerce_dtype=True,
+        ).save()
+
+        # define ['rna'].var schema
+        var_schema_rna = ln.Schema(
+            name="mudata_papalexi21_subset_rna_var_schema",
+            itype=bt.Gene.symbol,
+            dtype=float,
+        ).save()
+
+        # define composite schema
+        mudata_schema = ln.Schema(
+            name="mudata_papalexi21_subset_mudata_schema",
+            otype="MuData",
+            components={
+                "obs": obs_schema,
+                "rna:obs": obs_schema_rna,
+                "hto:obs": obs_schema_hto,
+                "rna:var": var_schema_rna,
+            },
+        ).save()

-
-
+        # curate a MuData
+        mdata = ln.core.datasets.mudata_papalexi21_subset()
+        bt.settings.organism = "human"  # set the organism
+        curator = ln.curators.MuDataCurator(mdata, mudata_schema)
+        artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
+        assert artifact.schema == mudata_schema
     """

     def __init__(
-        self,
-
+        self,
+        dataset: MuData | Artifact,
+        schema: Schema,
+    ) -> None:
+        super().__init__(dataset=dataset, schema=schema)
+        if not data_is_mudata(self._dataset):
+            raise InvalidArgument("dataset must be MuData-like.")
+        if schema.otype != "MuData":
+            raise InvalidArgument("Schema otype must be 'MuData'.")
+
+        for slot, slot_schema in schema.slots.items():
+            # Assign to _slots
+            if ":" in slot:
+                modality, modality_slot = slot.split(":")
+                schema_dataset = self._dataset.__getitem__(modality)
+            else:
+                modality, modality_slot = None, slot
+                schema_dataset = self._dataset
+            self._slots[slot] = DataFrameCurator(
+                (
+                    getattr(schema_dataset, modality_slot).T
+                    if modality_slot == "var"
+                    else getattr(schema_dataset, modality_slot)
+                ),
+                slot_schema,
+            )
+            _assign_var_fields_categoricals_multimodal(
+                modality=modality,
+                slot_type=modality_slot,
+                slot=slot,
+                slot_schema=slot_schema,
+                var_fields=self._var_fields,
+                categoricals=self._categoricals,
+                slots=self._slots,
+            )
+
+        # for consistency with BaseCatManager
+        self._columns_field = self._var_fields
+
+
+class SpatialDataCurator(SlotsCurator):
+    # the example in the docstring is tested in test_curators_quickstart_example
+    """Curator for `SpatialData`.
+
+    See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
+
+    .. versionadded:: 1.3.0
+
+    Args:
+        dataset: The SpatialData-like object to validate & annotate.
+        schema: A `Schema` object that defines the validation constraints.
+
+    Example::
+
+        import lamindb as ln
+        import bionty as bt
+
+        # define sample schema
+        sample_schema = ln.Schema(
+            name="blobs_sample_level_metadata",
+            features=[
+                ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
+                ln.Feature(name="disease", dtype=bt.Disease).save(),
+                ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
+            ],
+            coerce_dtype=True
+        ).save()
+
+        # define table obs schema
+        blobs_obs_schema = ln.Schema(
+            name="blobs_obs_level_metadata",
+            features=[
+                ln.Feature(name="sample_region", dtype="str").save(),
+            ],
+            coerce_dtype=True
+        ).save()
+
+        # define table var schema
+        blobs_var_schema = ln.Schema(
+            name="blobs_var_schema",
+            itype=bt.Gene.ensembl_gene_id,
+            dtype=int
+        ).save()
+
+        # define composite schema
+        spatialdata_schema = ln.Schema(
+            name="blobs_spatialdata_schema",
+            otype="SpatialData",
+            components={
+                "sample": sample_schema,
+                "table:obs": blobs_obs_schema,
+                "table:var": blobs_var_schema,
+        }).save()
+
+        # curate a SpatialData
+        spatialdata = ln.core.datasets.spatialdata_blobs()
+        curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
+        try:
+            curator.validate()
+        except ln.errors.ValidationError as error:
+            print(error)
+
+        # validate again (must pass now) and save artifact
+        artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
+        assert artifact.schema == spatialdata_schema
+    """
+
+    def __init__(
+        self,
+        dataset: SpatialData | Artifact,
+        schema: Schema,
+        *,
+        sample_metadata_key: str | None = "sample",
+    ) -> None:
+        super().__init__(dataset=dataset, schema=schema)
+        if not data_is_spatialdata(self._dataset):
+            raise InvalidArgument("dataset must be SpatialData-like.")
+        if schema.otype != "SpatialData":
+            raise InvalidArgument("Schema otype must be 'SpatialData'.")
+
+        for slot, slot_schema in schema.slots.items():
+            # Assign to _slots
+            if ":" in slot:
+                table_key, table_slot = slot.split(":")
+                schema_dataset = self._dataset.tables.__getitem__(table_key)
+            # sample metadata (does not have a `:` separator)
+            else:
+                table_key = None
+                table_slot = slot
+                schema_dataset = self._dataset.get_attrs(
+                    key=sample_metadata_key, return_as="df", flatten=True
+                )
+
+            self._slots[slot] = DataFrameCurator(
+                (
+                    getattr(schema_dataset, table_slot).T
+                    if table_slot == "var"
+                    else (
+                        getattr(schema_dataset, table_slot)
+                        if table_slot != sample_metadata_key
+                        else schema_dataset
+                    )  # just take the schema_dataset if it's the sample metadata key
+                ),
+                slot_schema,
+            )
+
+            _assign_var_fields_categoricals_multimodal(
+                modality=table_key,
+                slot_type=table_slot,
+                slot=slot,
+                slot_schema=slot_schema,
+                var_fields=self._var_fields,
+                categoricals=self._categoricals,
+                slots=self._slots,
+            )
+
+        # for consistency with BaseCatManager
+        self._columns_field = self._var_fields
+
+
+class CatManager:
+    """Manage categoricals by updating registries.
+
+    This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
+
+    If you find non-validated values, you have several options:
+
+    - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
+    - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
+    """
+
+    def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
         # the below is shared with Curator
         self._artifact: Artifact = None  # pass the dataset as an artifact
         self._dataset: Any = dataset  # pass the dataset as a UPathStr or data object
@@ -560,7 +872,6 @@ class CatManager:
         self._non_validated = None
         self._organism = organism
         self._sources = sources or {}
-        self._exclude = exclude or {}
         self._columns_field = columns_field
         self._validate_category_error_messages: str = ""

@@ -645,10 +956,10 @@ class CatManager:
             settings.verbosity = "warning"
         self._artifact = save_artifact(  # type: ignore
             self._dataset,
+            key=key,
             description=description,
             fields=self.categoricals,
-
-            key=key,
+            index_field=self._columns_field,
             artifact=self._artifact,
             revises=revises,
             run=run,
@@ -662,34 +973,7 @@ class CatManager:


 class DataFrameCatManager(CatManager):
-    """
-
-    See also :class:`~lamindb.Curator`.
-
-    Args:
-        df: The DataFrame object to curate.
-        columns: The field attribute for the feature column.
-        categoricals: A dictionary mapping column names to registry_field.
-        verbosity: The verbosity level.
-        organism: The organism name.
-        sources: A dictionary mapping column names to Source records.
-        exclude: A dictionary mapping column names to values to exclude from validation.
-            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
-            using the exclude parameter ensures they are not validated.
-
-    Returns:
-        A curator object.
-
-    Examples:
-        >>> import bionty as bt
-        >>> curator = ln.Curator.from_df(
-        ...     df,
-        ...     categoricals={
-        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
-        ...         "donor_id": ULabel.name
-        ...     }
-        ... )
-    """
+    """Categorical manager for `DataFrame`."""

     def __init__(
         self,
@@ -699,7 +983,6 @@ class DataFrameCatManager(CatManager):
         verbosity: str = "hint",
         organism: str | None = None,
         sources: dict[str, Record] | None = None,
-        exclude: dict | None = None,
     ) -> None:
         from lamindb.core._settings import settings

@@ -714,17 +997,16 @@ class DataFrameCatManager(CatManager):
             organism=organism,
             categoricals=categoricals,
             sources=sources,
-            exclude=exclude,
         )
         self._save_columns()

-    def lookup(self, public: bool = False) -> CurateLookup:
+    def lookup(self, public: bool = False) -> CatLookup:
         """Lookup categories.

         Args:
             public: If "public", the lookup is performed on the public reference.
         """
-        return CurateLookup(
+        return CatLookup(
             categoricals=self._categoricals,
             slots={"columns": self._columns_field},
             public=public,
@@ -739,7 +1021,6 @@ class DataFrameCatManager(CatManager):
             key="columns",
             validated_only=False,
             source=self._sources.get("columns"),
-            exclude=self._exclude.get("columns"),
         )

         # Save the rest of the columns based on validated_only
@@ -752,7 +1033,6 @@ class DataFrameCatManager(CatManager):
             validated_only=validated_only,
             df=self._dataset,  # Get the Feature type from df
             source=self._sources.get("columns"),
-            exclude=self._exclude.get("columns"),
         )

     @deprecated(new_name="is run by default")
@@ -778,7 +1058,6 @@ class DataFrameCatManager(CatManager):
             self._dataset,
             fields=self.categoricals,
             sources=self._sources,
-            exclude=self._exclude,
             curator=self,
             organism=self._organism,
         )
@@ -852,7 +1131,6 @@ class DataFrameCatManager(CatManager):
                 key=categorical,
                 validated_only=validated_only,
                 source=self._sources.get(categorical),
-                exclude=self._exclude.get(categorical),
                 organism=self._organism,
             )
             # adding new records removes them from non_validated
@@ -882,32 +1160,7 @@ class DataFrameCatManager(CatManager):


 class AnnDataCatManager(CatManager):
-    """
-
-    Args:
-        data: The AnnData object or an AnnData-like path.
-        var_index: The registry field for mapping the ``.var`` index.
-        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
-        obs_columns: The registry field for mapping the ``.obs.columns``.
-        verbosity: The verbosity level.
-        organism: The organism name.
-        sources: A dictionary mapping ``.obs.columns`` to Source records.
-        exclude: A dictionary mapping column names to values to exclude from validation.
-            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
-            using the exclude parameter ensures they are not validated.
-
-    Examples:
-        >>> import bionty as bt
-        >>> curator = ln.Curator.from_anndata(
-        ...     adata,
-        ...     var_index=bt.Gene.ensembl_gene_id,
-        ...     categoricals={
-        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
-        ...         "donor_id": ULabel.name
-        ...     },
-        ...     organism="human",
-        ... )
-    """
+    """Categorical manager for `AnnData`."""

     def __init__(
         self,
@@ -918,13 +1171,10 @@ class AnnDataCatManager(CatManager):
         verbosity: str = "hint",
         organism: str | None = None,
         sources: dict[str, Record] | None = None,
-        exclude: dict | None = None,
     ) -> None:
         if isinstance(var_index, str):
             raise TypeError("var_index parameter has to be a bionty field")

-        if sources is None:
-            sources = {}
         if not data_is_anndata(data):
             raise TypeError("data has to be an AnnData object")

@@ -935,12 +1185,12 @@ class AnnDataCatManager(CatManager):

         self._obs_fields = categoricals or {}
         self._var_field = var_index
+        self._sources = sources or {}
         super().__init__(
             dataset=data,
             categoricals=categoricals,
-            sources=
+            sources=self._sources,
             organism=organism,
-            exclude=exclude,
             columns_field=var_index,
         )
         self._adata = self._dataset
@@ -950,8 +1200,7 @@ class AnnDataCatManager(CatManager):
             columns=obs_columns,
             verbosity=verbosity,
             organism=None,
-            sources=
-            exclude=exclude,
+            sources=self._sources,
         )

     @property
@@ -964,13 +1213,13 @@ class AnnDataCatManager(CatManager):
         """Return the obs fields to validate against."""
         return self._obs_fields

-    def lookup(self, public: bool = False) -> CurateLookup:
+    def lookup(self, public: bool = False) -> CatLookup:
         """Lookup categories.

         Args:
             public: If "public", the lookup is performed on the public reference.
         """
-        return CurateLookup(
+        return CatLookup(
             categoricals=self._obs_fields,
             slots={"columns": self._columns_field, "var_index": self._var_field},
             public=public,
@@ -989,7 +1238,6 @@ class AnnDataCatManager(CatManager):
             validated_only=validated_only,
             organism=self._organism,
             source=self._sources.get("var_index"),
-            exclude=self._exclude.get("var_index"),
         )

     def add_new_from(self, key: str, **kwargs):
@@ -1033,7 +1281,6 @@ class AnnDataCatManager(CatManager):
                 key="var_index",
                 source=self._sources.get("var_index"),
                 hint_print=".add_new_from_var_index()",
-                exclude=self._exclude.get("var_index"),
                 organism=self._organism,  # type: ignore
             )
         else:
@@ -1077,59 +1324,29 @@ class AnnDataCatManager(CatManager):


 class MuDataCatManager(CatManager):
-    """
-
-    Args:
-        mdata: The MuData object to curate.
-        var_index: The registry field for mapping the ``.var`` index for each modality.
-            For example:
-            ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
-        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
-            Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
-        verbosity: The verbosity level.
-        organism: The organism name.
-        sources: A dictionary mapping ``.obs.columns`` to Source records.
-        exclude: A dictionary mapping column names to values to exclude from validation.
-            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
-            using the exclude parameter ensures they are not validated.
-
-    Examples:
-        >>> import bionty as bt
-        >>> curator = ln.Curator.from_mudata(
-        ...     mdata,
-        ...     var_index={
-        ...         "rna": bt.Gene.ensembl_gene_id,
-        ...         "adt": CellMarker.name
-        ...     },
-        ...     categoricals={
-        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
-        ...         "donor_id": ULabel.name
-        ...     },
-        ...     organism="human",
-        ... )
-    """
+    """Categorical manager for `MuData`."""

     def __init__(
         self,
         mdata: MuData | Artifact,
-        var_index: dict[str, FieldAttr],
+        var_index: dict[str, FieldAttr] | None = None,
         categoricals: dict[str, FieldAttr] | None = None,
         verbosity: str = "hint",
         organism: str | None = None,
         sources: dict[str, Record] | None = None,
-        exclude: dict | None = None,  # {modality: {field: [values]}}
     ) -> None:
         super().__init__(
             dataset=mdata,
             categoricals={},
             sources=sources,
             organism=organism,
-            exclude=exclude,
         )
-        self._columns_field =
-
+        self._columns_field = (
+            var_index or {}
+        )  # this is for consistency with BaseCatManager
+        self._var_fields = var_index or {}
         self._verify_modality(self._var_fields.keys())
-        self._obs_fields = self._parse_categoricals(categoricals)
+        self._obs_fields = self._parse_categoricals(categoricals or {})
         self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
         self._verbosity = verbosity
         self._obs_df_curator = None
@@ -1140,7 +1357,6 @@ class MuDataCatManager(CatManager):
             categoricals=self._obs_fields.get("obs", {}),
             verbosity=verbosity,
             sources=self._sources.get("obs"),
-            exclude=self._exclude.get("obs"),
             organism=organism,
         )
         self._mod_adata_curators = {
@@ -1150,7 +1366,6 @@ class MuDataCatManager(CatManager):
                 categoricals=self._obs_fields.get(modality),
                 verbosity=verbosity,
                 sources=self._sources.get(modality),
-                exclude=self._exclude.get(modality),
                 organism=organism,
             )
             for modality in self._modalities
@@ -1199,7 +1414,7 @@ class MuDataCatManager(CatManager):
                 obs_fields["obs"][k] = v
         return obs_fields

-    def lookup(self, public: bool = False) -> CurateLookup:
+    def lookup(self, public: bool = False) -> CatLookup:
         """Lookup categories.

         Args:
@@ -1212,7 +1427,7 @@ class MuDataCatManager(CatManager):
                 obs_fields[k] = v
             else:
                 obs_fields[f"{mod}:{k}"] = v
-        return CurateLookup(
+        return CatLookup(
             categoricals=obs_fields,
             slots={
                 **{f"{k}_var_index": v for k, v in self._var_fields.items()},
@@ -1271,8 +1486,6 @@ class MuDataCatManager(CatManager):

     def validate(self) -> bool:
         """Validate categories."""
-        from lamindb.core._settings import settings
-
         # add all validated records to the current instance
         verbosity = settings.verbosity
         try:
@@ -1329,393 +1542,290 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1329
1542
|
)
|
1330
1543
|
|
1331
1544
|
|
1332
|
-
class
|
1333
|
-
"""
|
1334
|
-
|
1335
|
-
Args:
|
1336
|
-
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
1337
|
-
var_index: The registry fields for mapping the `.var` indices for measurements.
|
1338
|
-
Should be in the form `{"measurement name": ("var column", field)}`.
|
1339
|
-
These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
|
1340
|
-
in `.standardize` or `.add_new_from`, see the output of `.var_index`.
|
1341
|
-
categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
|
1342
|
-
obs_columns: The registry field for mapping the names of the `.obs` columns.
|
1343
|
-
organism: The organism name.
|
1344
|
-
sources: A dictionary mapping `.obs` columns to Source records.
|
1345
|
-
exclude: A dictionary mapping column names to values to exclude from validation.
|
1346
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1347
|
-
using the exclude parameter ensures they are not validated.
|
1348
|
-
|
1349
|
-
Examples:
|
1350
|
-
>>> import bionty as bt
|
1351
|
-
>>> curator = ln.Curator.from_tiledbsoma(
|
1352
|
-
... "./my_array_store.tiledbsoma",
|
1353
|
-
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1354
|
-
... categoricals={
|
1355
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1356
|
-
... "donor_id": ULabel.name
|
1357
|
-
... },
|
1358
|
-
... organism="human",
|
1359
|
-
... )
|
1360
|
-
"""
|
1545
|
+
class SpatialDataCatManager(CatManager):
|
1546
|
+
"""Categorical manager for `SpatialData`."""
|
1361
1547
|
|
1362
1548
|
def __init__(
|
1363
1549
|
self,
|
1364
|
-
|
1365
|
-
var_index: dict[str,
|
1366
|
-
categoricals: dict[str, FieldAttr] | None = None,
|
1367
|
-
|
1550
|
+
sdata: Any,
|
1551
|
+
var_index: dict[str, FieldAttr],
|
1552
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
1553
|
+
verbosity: str = "hint",
|
1368
1554
|
organism: str | None = None,
|
1369
|
-
sources: dict[str, Record] | None = None,
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1555
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
1556
|
+
*,
|
1557
|
+
sample_metadata_key: str | None = "sample",
|
1558
|
+
) -> None:
|
1559
|
+
super().__init__(
|
1560
|
+
dataset=sdata,
|
1561
|
+
categoricals={},
|
1562
|
+
sources=sources,
|
1563
|
+
organism=organism,
|
1564
|
+
)
|
1565
|
+
if isinstance(sdata, Artifact):
|
1566
|
+
self._sdata = sdata.load()
|
1378
1567
|
else:
|
1379
|
-
self.
|
1380
|
-
|
1381
|
-
self.
|
1382
|
-
self.
|
1383
|
-
self.
|
1384
|
-
|
1385
|
-
self.
|
1386
|
-
|
1387
|
-
self._validated_values: dict[str, list] = {}
|
1388
|
-
# filled by _check_save_keys
|
1389
|
-
self._n_obs: int | None = None
|
1390
|
-
self._valid_obs_keys: list[str] | None = None
|
1391
|
-
self._obs_pa_schema: pa.lib.Schema | None = (
|
1392
|
-
None # this is needed to create the obs feature set
|
1568
|
+
self._sdata = self._dataset
|
1569
|
+
self._sample_metadata_key = sample_metadata_key
|
1570
|
+
self._write_path = None
|
1571
|
+
self._var_fields = var_index
|
1572
|
+
self._verify_accessor_exists(self._var_fields.keys())
|
1573
|
+
self._categoricals = categoricals
|
1574
|
+
self._table_keys = set(self._var_fields.keys()) | set(
|
1575
|
+
self._categoricals.keys() - {self._sample_metadata_key}
|
1393
1576
|
)
|
1394
|
-
self.
|
1395
|
-
self.
|
1396
|
-
self.
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1402
|
-
|
1403
|
-
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1404
|
-
experiment_obs = experiment.obs
|
1405
|
-
self._n_obs = len(experiment_obs)
|
1406
|
-
self._obs_pa_schema = experiment_obs.schema
|
1407
|
-
valid_obs_keys = [
|
1408
|
-
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1409
|
-
]
|
1410
|
-
self._valid_obs_keys = valid_obs_keys
|
1411
|
-
|
1412
|
-
valid_var_keys = []
|
1413
|
-
ms_list = []
|
1414
|
-
for ms in experiment.ms.keys():
|
1415
|
-
ms_list.append(ms)
|
1416
|
-
var_ms = experiment.ms[ms].var
|
1417
|
-
valid_var_keys += [
|
1418
|
-
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1419
|
-
]
|
1420
|
-
self._valid_var_keys = valid_var_keys
|
1577
|
+
self._verbosity = verbosity
|
1578
|
+
self._sample_df_curator = None
|
1579
|
+
if self._sample_metadata_key is not None:
|
1580
|
+
self._sample_metadata = self._sdata.get_attrs(
|
1581
|
+
key=self._sample_metadata_key, return_as="df", flatten=True
|
1582
|
+
)
|
1583
|
+
self._is_validated = False
|
1421
1584
|
|
1422
|
-
#
|
1585
|
+
# Check validity of keys in categoricals
|
1423
1586
|
nonval_keys = []
|
1424
|
-
for
|
1425
|
-
if
|
1426
|
-
|
1587
|
+
for accessor, accessor_categoricals in self._categoricals.items():
|
1588
|
+
if (
|
1589
|
+
accessor == self._sample_metadata_key
|
1590
|
+
and self._sample_metadata is not None
|
1591
|
+
):
|
1592
|
+
for key in accessor_categoricals.keys():
|
1593
|
+
if key not in self._sample_metadata.columns:
|
1594
|
+
nonval_keys.append(key)
|
1595
|
+
else:
|
1596
|
+
for key in accessor_categoricals.keys():
|
1597
|
+
if key not in self._sdata[accessor].obs.columns:
|
1598
|
+
nonval_keys.append(key)
|
1599
|
+
|
1427
1600
|
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1428
1601
|
|
1429
|
-
# check validity of keys in
|
1430
|
-
self._var_fields_flat = {}
|
1602
|
+
# check validity of keys in sources
|
1431
1603
|
nonval_keys = []
|
1432
|
-
for
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1604
|
+
for accessor, accessor_sources in self._sources.items():
|
1605
|
+
if (
|
1606
|
+
accessor == self._sample_metadata_key
|
1607
|
+
and self._sample_metadata is not None
|
1608
|
+
):
|
1609
|
+
columns = self._sample_metadata.columns
|
1610
|
+
elif accessor != self._sample_metadata_key:
|
1611
|
+
columns = self._sdata[accessor].obs.columns
|
1437
1612
|
else:
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1444
|
-
nonval_keys = []
|
1445
|
-
for arg_key in dct.keys():
|
1446
|
-
if arg_key not in valid_arg_keys:
|
1447
|
-
nonval_keys.append(arg_key)
|
1448
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1613
|
+
continue
|
1614
|
+
for key in accessor_sources:
|
1615
|
+
if key not in columns:
|
1616
|
+
nonval_keys.append(key)
|
1617
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1449
1618
|
|
1450
|
-
#
|
1451
|
-
register_columns = list(self._obs_fields.keys())
|
1452
|
-
organism = check_registry_organism(
|
1453
|
-
self._columns_field.field.model, self._organism
|
1454
|
-
).get("organism")
|
1455
|
-
update_registry(
|
1456
|
-
values=register_columns,
|
1457
|
-
field=self._columns_field,
|
1458
|
-
key="columns",
|
1459
|
-
validated_only=False,
|
1460
|
-
organism=organism,
|
1461
|
-
source=self._sources.get("columns"),
|
1462
|
-
exclude=self._exclude.get("columns"),
|
1463
|
-
)
|
1464
|
-
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1465
|
-
# no need to register with validated_only=True if columns are features
|
1619
|
+
# Set up sample level metadata and table Curator objects
|
1466
1620
|
if (
|
1467
|
-
|
1468
|
-
and self.
|
1621
|
+
self._sample_metadata_key is not None
|
1622
|
+
and self._sample_metadata_key in self._categoricals
|
1469
1623
|
):
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1474
|
-
|
1624
|
+
self._sample_df_curator = DataFrameCatManager(
|
1625
|
+
df=self._sample_metadata,
|
1626
|
+
columns=Feature.name,
|
1627
|
+
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1628
|
+
verbosity=verbosity,
|
1629
|
+
sources=self._sources.get(self._sample_metadata_key),
|
1630
|
+
organism=organism,
|
1631
|
+
)
|
1632
|
+
self._table_adata_curators = {
|
1633
|
+
table: AnnDataCatManager(
|
1634
|
+
data=self._sdata[table],
|
1635
|
+
var_index=var_index.get(table),
|
1636
|
+
categoricals=self._categoricals.get(table),
|
1637
|
+
verbosity=verbosity,
|
1638
|
+
sources=self._sources.get(table),
|
1475
1639
|
organism=organism,
|
1476
|
-
source=self._sources.get("columns"),
|
1477
|
-
exclude=self._exclude.get("columns"),
|
1478
1640
|
)
|
1641
|
+
for table in self._table_keys
|
1642
|
+
}
|
1479
1643
|
|
1480
|
-
|
1481
|
-
"""Validate categories."""
|
1482
|
-
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1644
|
+
self._non_validated = None
|
1483
1645
|
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
var_ms = experiment.ms[ms].var
|
1489
|
-
var_ms_key = f"{ms}__{key}"
|
1490
|
-
# it was already validated and cached
|
1491
|
-
if var_ms_key in self._validated_values:
|
1492
|
-
continue
|
1493
|
-
var_ms_values = (
|
1494
|
-
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
1495
|
-
)
|
1496
|
-
organism = check_registry_organism(
|
1497
|
-
field.field.model, self._organism
|
1498
|
-
).get("organism")
|
1499
|
-
update_registry(
|
1500
|
-
values=var_ms_values,
|
1501
|
-
field=field,
|
1502
|
-
key=var_ms_key,
|
1503
|
-
validated_only=True,
|
1504
|
-
organism=organism,
|
1505
|
-
source=self._sources.get(var_ms_key),
|
1506
|
-
exclude=self._exclude.get(var_ms_key),
|
1507
|
-
)
|
1508
|
-
_, non_val = validate_categories(
|
1509
|
-
values=var_ms_values,
|
1510
|
-
field=field,
|
1511
|
-
key=var_ms_key,
|
1512
|
-
organism=organism,
|
1513
|
-
source=self._sources.get(var_ms_key),
|
1514
|
-
exclude=self._exclude.get(var_ms_key),
|
1515
|
-
)
|
1516
|
-
if len(non_val) > 0:
|
1517
|
-
validated = False
|
1518
|
-
self._non_validated_values[var_ms_key] = non_val
|
1519
|
-
else:
|
1520
|
-
self._validated_values[var_ms_key] = var_ms_values
|
1646
|
+
@property
|
1647
|
+
def var_index(self) -> FieldAttr:
|
1648
|
+
"""Return the registry fields to validate variables indices against."""
|
1649
|
+
return self._var_fields
|
1521
1650
|
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
continue
|
1527
|
-
values = pa.compute.unique(
|
1528
|
-
obs.read(column_names=[key]).concat()[key]
|
1529
|
-
).to_pylist()
|
1530
|
-
organism = check_registry_organism(
|
1531
|
-
field.field.model, self._organism
|
1532
|
-
).get("organism")
|
1533
|
-
update_registry(
|
1534
|
-
values=values,
|
1535
|
-
field=field,
|
1536
|
-
key=key,
|
1537
|
-
validated_only=True,
|
1538
|
-
organism=organism,
|
1539
|
-
source=self._sources.get(key),
|
1540
|
-
exclude=self._exclude.get(key),
|
1541
|
-
)
|
1542
|
-
_, non_val = validate_categories(
|
1543
|
-
values=values,
|
1544
|
-
field=field,
|
1545
|
-
key=key,
|
1546
|
-
organism=organism,
|
1547
|
-
source=self._sources.get(key),
|
1548
|
-
exclude=self._exclude.get(key),
|
1549
|
-
)
|
1550
|
-
if len(non_val) > 0:
|
1551
|
-
validated = False
|
1552
|
-
self._non_validated_values[key] = non_val
|
1553
|
-
else:
|
1554
|
-
self._validated_values[key] = values
|
1555
|
-
self._is_validated = validated
|
1556
|
-
return self._is_validated
|
1651
|
+
@property
|
1652
|
+
def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
|
1653
|
+
"""Return the categorical keys and fields to validate against."""
|
1654
|
+
return self._categoricals
|
1557
1655
|
|
1558
|
-
|
1559
|
-
|
1656
|
+
@property
|
1657
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
1658
|
+
"""Return the non-validated features and labels."""
|
1659
|
+
if self._non_validated is None:
|
1660
|
+
raise ValidationError("Please run validate() first!")
|
1661
|
+
return self._non_validated
|
1560
1662
|
|
1561
|
-
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1663
|
+
def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
|
1664
|
+
"""Verify that the accessors exist (either a valid table or in attrs)."""
|
1665
|
+
for acc in accessors:
|
1666
|
+
is_present = False
|
1667
|
+
try:
|
1668
|
+
self._sdata.get_attrs(key=acc)
|
1669
|
+
is_present = True
|
1670
|
+
except KeyError:
|
1671
|
+
if acc in self._sdata.tables.keys():
|
1672
|
+
is_present = True
|
1673
|
+
if not is_present:
|
1674
|
+
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
1570
1675
|
|
1571
|
-
def
|
1572
|
-
"""
|
1676
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
1677
|
+
"""Look up categories.
|
1573
1678
|
|
1574
1679
|
Args:
|
1575
|
-
|
1576
|
-
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
1577
|
-
or a column name in `.obs`.
|
1680
|
+
public: Whether the lookup is performed on the public reference.
|
1578
1681
|
"""
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1682
|
+
cat_values_dict = list(self.categoricals.values())[0]
|
1683
|
+
return CatLookup(
|
1684
|
+
categoricals=cat_values_dict,
|
1685
|
+
slots={"accessors": cat_values_dict.keys()},
|
1686
|
+
public=public,
|
1687
|
+
)
|
1688
|
+
|
1689
|
+
def _update_registry_all(self) -> None:
|
1690
|
+
"""Saves labels of all features for sample and table metadata."""
|
1691
|
+
if self._sample_df_curator is not None:
|
1692
|
+
self._sample_df_curator._update_registry_all(
|
1693
|
+
validated_only=True,
|
1586
1694
|
)
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
)
|
1591
|
-
keys = [key]
|
1592
|
-
for k in keys:
|
1593
|
-
values, field = self._non_validated_values_field(k)
|
1594
|
-
if len(values) == 0:
|
1595
|
-
continue
|
1596
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1597
|
-
"organism"
|
1598
|
-
)
|
1599
|
-
update_registry(
|
1600
|
-
values=values,
|
1601
|
-
field=field,
|
1602
|
-
key=k,
|
1603
|
-
validated_only=False,
|
1604
|
-
organism=organism,
|
1605
|
-
source=self._sources.get(k),
|
1606
|
-
exclude=self._exclude.get(k),
|
1607
|
-
**kwargs,
|
1695
|
+
for _, adata_curator in self._table_adata_curators.items():
|
1696
|
+
adata_curator._obs_df_curator._update_registry_all(
|
1697
|
+
validated_only=True,
|
1608
1698
|
)
|
1609
|
-
# update non-validated values list but keep the key there
|
1610
|
-
# it will be removed by .validate()
|
1611
|
-
if k in self._non_validated_values:
|
1612
|
-
self._non_validated_values[k] = []
|
1613
1699
|
|
1614
|
-
|
1615
|
-
|
1616
|
-
"""Return the non-validated features and labels."""
|
1617
|
-
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
1618
|
-
return non_val
|
1700
|
+
def add_new_from_var_index(self, table: str, **kwargs) -> None:
|
1701
|
+
"""Save new values from ``.var.index`` of table.
|
1619
1702
|
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1703
|
+
Args:
|
1704
|
+
table: The table key.
|
1705
|
+
organism: The organism name.
|
1706
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1707
|
+
"""
|
1708
|
+
if self._non_validated is None:
|
1709
|
+
raise ValidationError("Run .validate() first.")
|
1710
|
+
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
1711
|
+
if table in self.non_validated.keys():
|
1712
|
+
if "var_index" in self._non_validated[table]:
|
1713
|
+
self._non_validated[table].pop("var_index")
|
1624
1714
|
|
1625
|
-
|
1626
|
-
|
1627
|
-
"""Return the obs fields to validate against."""
|
1628
|
-
return self._obs_fields
|
1715
|
+
if len(self.non_validated[table].values()) == 0:
|
1716
|
+
self.non_validated.pop(table)
|
1629
1717
|
|
1630
|
-
def
|
1631
|
-
|
1718
|
+
def add_new_from(
|
1719
|
+
self,
|
1720
|
+
key: str,
|
1721
|
+
accessor: str | None = None,
|
1722
|
+
**kwargs,
|
1723
|
+
) -> None:
|
1724
|
+
"""Save new values of categorical from sample level metadata or table.
|
1632
1725
|
|
1633
1726
|
Args:
|
1634
|
-
|
1727
|
+
key: The key referencing the slot in the DataFrame.
|
1728
|
+
accessor: The accessor key such as 'sample' or 'table x'.
|
1729
|
+
organism: The organism name.
|
1730
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1635
1731
|
"""
|
1636
|
-
|
1637
|
-
|
1638
|
-
slots={"columns": self._columns_field, **self._var_fields_flat},
|
1639
|
-
public=public,
|
1640
|
-
)
|
1732
|
+
if self._non_validated is None:
|
1733
|
+
raise ValidationError("Run .validate() first.")
|
1641
1734
|
|
1642
|
-
|
1643
|
-
|
1735
|
+
if len(kwargs) > 0 and key == "all":
|
1736
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
1737
|
+
|
1738
|
+
if accessor not in self.categoricals:
|
1739
|
+
raise ValueError(
|
1740
|
+
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
1741
|
+
)
|
1742
|
+
|
1743
|
+
if accessor in self._table_adata_curators:
|
1744
|
+
adata_curator = self._table_adata_curators[accessor]
|
1745
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
1746
|
+
if accessor == self._sample_metadata_key:
|
1747
|
+
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
1748
|
+
|
1749
|
+
if accessor in self.non_validated.keys():
|
1750
|
+
if len(self.non_validated[accessor].values()) == 0:
|
1751
|
+
self.non_validated.pop(accessor)
|
1752
|
+
|
1753
|
+
def standardize(self, key: str, accessor: str | None = None) -> None:
|
1754
|
+
"""Replace synonyms with canonical values.
|
1644
1755
|
|
1645
1756
|
Modifies the dataset inplace.
|
1646
1757
|
|
1647
1758
|
Args:
|
1648
|
-
key: The key referencing the slot in the
|
1649
|
-
|
1650
|
-
or a column name in `.obs`.
|
1759
|
+
key: The key referencing the slot in the table or sample metadata.
|
1760
|
+
accessor: The accessor key such as 'sample_key' or 'table_key'.
|
1651
1761
|
"""
|
1652
1762
|
if len(self.non_validated) == 0:
|
1653
1763
|
logger.warning("values are already standardized")
|
1654
1764
|
return
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1765
|
+
if self._artifact is not None:
|
1766
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
1767
|
+
|
1768
|
+
if accessor == self._sample_metadata_key:
|
1769
|
+
if key not in self._sample_metadata.columns:
|
1770
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
1658
1771
|
else:
|
1659
|
-
if
|
1660
|
-
|
1661
|
-
|
1662
|
-
|
1663
|
-
|
1772
|
+
if (
|
1773
|
+
key == "var_index" and self._sdata.tables[accessor].var.index is None
|
1774
|
+
) or (
|
1775
|
+
key != "var_index"
|
1776
|
+
and key not in self._sdata.tables[accessor].obs.columns
|
1777
|
+
):
|
1778
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
1664
1779
|
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
ms, _, slot_key = k.partition("__")
|
1671
|
-
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
1672
|
-
else:
|
1673
|
-
slot = lambda experiment: experiment.obs
|
1674
|
-
slot_key = k
|
1675
|
-
# errors if public ontology and the model has no organism
|
1676
|
-
# has to be fixed in bionty
|
1677
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1678
|
-
"organism"
|
1679
|
-
)
|
1680
|
-
syn_mapper = standardize_categories(
|
1681
|
-
values=values,
|
1682
|
-
field=field,
|
1683
|
-
source=self._sources.get(k),
|
1684
|
-
organism=organism,
|
1685
|
-
)
|
1686
|
-
if (n_syn_mapper := len(syn_mapper)) == 0:
|
1687
|
-
continue
|
1780
|
+
if accessor in self._table_adata_curators.keys():
|
1781
|
+
adata_curator = self._table_adata_curators[accessor]
|
1782
|
+
adata_curator.standardize(key)
|
1783
|
+
if accessor == self._sample_metadata_key:
|
1784
|
+
self._sample_df_curator.standardize(key)
|
1688
1785
|
|
1689
|
-
|
1786
|
+
if len(self.non_validated[accessor].values()) == 0:
|
1787
|
+
self.non_validated.pop(accessor)
|
1690
1788
|
|
1691
|
-
|
1692
|
-
|
1693
|
-
table = slot(experiment).read(value_filter=value_filter).concat()
|
1789
|
+
def validate(self) -> bool:
|
1790
|
+
"""Validate variables and categorical observations.
|
1694
1791
|
|
1695
|
-
|
1696
|
-
|
1792
|
+
This method also registers the validated records in the current instance:
|
1793
|
+
- from public sources
|
1697
1794
|
|
1698
|
-
|
1699
|
-
|
1700
|
-
df[slot_key] = df[slot_key].map(
|
1701
|
-
lambda val: syn_mapper.get(val, val) # noqa
|
1702
|
-
)
|
1703
|
-
# write the mapped values
|
1704
|
-
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
1705
|
-
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
1706
|
-
# update non_validated dict
|
1707
|
-
non_val_k = [
|
1708
|
-
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
1709
|
-
]
|
1710
|
-
self._non_validated_values[k] = non_val_k
|
1795
|
+
Args:
|
1796
|
+
organism: The organism name.
|
1711
1797
|
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1718
|
-
|
1798
|
+
Returns:
|
1799
|
+
Whether the SpatialData object is validated.
|
1800
|
+
"""
|
1801
|
+
# add all validated records to the current instance
|
1802
|
+
verbosity = settings.verbosity
|
1803
|
+
try:
|
1804
|
+
settings.verbosity = "error"
|
1805
|
+
self._update_registry_all()
|
1806
|
+
finally:
|
1807
|
+
settings.verbosity = verbosity
|
1808
|
+
|
1809
|
+
self._non_validated = {} # type: ignore
|
1810
|
+
|
1811
|
+
sample_validated = True
|
1812
|
+
if self._sample_df_curator:
|
1813
|
+
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
1814
|
+
sample_validated &= self._sample_df_curator.validate()
|
1815
|
+
if len(self._sample_df_curator.non_validated) > 0:
|
1816
|
+
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
1817
|
+
logger.print("")
|
1818
|
+
|
1819
|
+
mods_validated = True
|
1820
|
+
for table, adata_curator in self._table_adata_curators.items():
|
1821
|
+
logger.info(f"validating categoricals of table '{table}' ...")
|
1822
|
+
mods_validated &= adata_curator.validate()
|
1823
|
+
if len(adata_curator.non_validated) > 0:
|
1824
|
+
self._non_validated[table] = adata_curator.non_validated # type: ignore
|
1825
|
+
logger.print("")
|
1826
|
+
|
1827
|
+
self._is_validated = sample_validated & mods_validated
|
1828
|
+
return self._is_validated
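A sketch of the overall flow this `validate()` call belongs to, loosely following the example that used to live in the class docstring; the table name, column names, and ontology fields describe one hypothetical dataset rather than requirements:

    import bionty as bt
    from lamindb import ULabel
    from lamindb.curators import SpatialDataCatManager

    curator = SpatialDataCatManager(
        sdata,  # a spatialdata.SpatialData object, or an Artifact wrapping one
        var_index={"table_1": bt.Gene.ensembl_gene_id},
        categoricals={
            "table_1": {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
            "sample": {"experimental_factor": bt.ExperimentalFactor.name},
        },
        organism="human",
    )
    if not curator.validate():
        print(curator.non_validated)  # inspect failing categories before adding or standardizing terms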
|
1719
1829
|
|
1720
1830
|
def save_artifact(
|
1721
1831
|
self,
|
@@ -1725,423 +1835,388 @@ class TiledbsomaCatManager(CatManager):
|
|
1725
1835
|
revises: Artifact | None = None,
|
1726
1836
|
run: Run | None = None,
|
1727
1837
|
) -> Artifact:
|
1728
|
-
"""Save the validated
|
1838
|
+
"""Save the validated SpatialData store and metadata.
|
1729
1839
|
|
1730
1840
|
Args:
|
1731
|
-
description: A description of the
|
1841
|
+
description: A description of the dataset.
|
1732
1842
|
key: A path-like key to reference artifact in default storage,
|
1733
|
-
e.g., `"
|
1843
|
+
e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
|
1734
1844
|
revises: Previous version of the artifact. Triggers a revision.
|
1735
1845
|
run: The run that creates the artifact.
|
1736
1846
|
|
1737
1847
|
Returns:
|
1738
1848
|
A saved artifact record.
|
1739
1849
|
"""
|
1740
|
-
from lamindb.models.artifact import add_labels
|
1741
|
-
|
1742
1850
|
if not self._is_validated:
|
1743
1851
|
self.validate()
|
1744
1852
|
if not self._is_validated:
|
1745
1853
|
raise ValidationError("Dataset does not validate. Please curate.")
|
1746
1854
|
|
1747
|
-
|
1748
|
-
|
1749
|
-
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1855
|
+
return save_artifact(
|
1856
|
+
self._sdata,
|
1857
|
+
description=description,
|
1858
|
+
fields=self.categoricals,
|
1859
|
+
index_field=self.var_index,
|
1860
|
+
key=key,
|
1861
|
+
artifact=self._artifact,
|
1862
|
+
revises=revises,
|
1863
|
+
run=run,
|
1864
|
+
schema=None,
|
1865
|
+
organism=self._organism,
|
1866
|
+
sample_metadata_key=self._sample_metadata_key,
|
1867
|
+
)
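Once validation passes, the curated store can be registered as an artifact; a sketch assuming the `curator` from the example above (description and key are illustrative):

    artifact = curator.save_artifact(
        description="curated spatial transcriptomics dataset",
        key="myartifact.zarr",  # artifacts sharing a key form a version family
    )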
|
1868
|
+
|
1869
|
+
|
1870
|
+
class TiledbsomaCatManager(CatManager):
|
1871
|
+
"""Categorical manager for `tiledbsoma.Experiment`."""
|
1872
|
+
|
1873
|
+
def __init__(
|
1874
|
+
self,
|
1875
|
+
experiment_uri: UPathStr | Artifact,
|
1876
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
1877
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
1878
|
+
obs_columns: FieldAttr = Feature.name,
|
1879
|
+
organism: str | None = None,
|
1880
|
+
sources: dict[str, Record] | None = None,
|
1881
|
+
):
|
1882
|
+
self._obs_fields = categoricals or {}
|
1883
|
+
self._var_fields = var_index
|
1884
|
+
self._columns_field = obs_columns
|
1885
|
+
if isinstance(experiment_uri, Artifact):
|
1886
|
+
self._dataset = experiment_uri.path
|
1887
|
+
self._artifact = experiment_uri
|
1758
1888
|
else:
|
1759
|
-
|
1889
|
+
self._dataset = UPath(experiment_uri)
|
1890
|
+
self._artifact = None
|
1891
|
+
self._organism = organism
|
1892
|
+
self._sources = sources or {}
|
1760
1893
|
|
1761
|
-
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
|
1767
|
-
|
1768
|
-
|
1769
|
-
|
1770
|
-
|
1771
|
-
|
1772
|
-
|
1773
|
-
field=self._columns_field,
|
1774
|
-
mute=True,
|
1775
|
-
organism=organism,
|
1776
|
-
)
|
1777
|
-
for ms in self._var_fields:
|
1778
|
-
var_key, var_field = self._var_fields[ms]
|
1779
|
-
organism = check_registry_organism(
|
1780
|
-
var_field.field.model, self._organism
|
1781
|
-
).get("organism")
|
1782
|
-
feature_sets[f"{ms}__var"] = Schema.from_values(
|
1783
|
-
values=self._validated_values[f"{ms}__{var_key}"],
|
1784
|
-
field=var_field,
|
1785
|
-
organism=organism,
|
1786
|
-
raise_validation_error=False,
|
1787
|
-
)
|
1788
|
-
artifact._staged_feature_sets = feature_sets
|
1789
|
-
|
1790
|
-
feature_ref_is_name = _ref_is_name(self._columns_field)
|
1791
|
-
features = Feature.lookup().dict()
|
1792
|
-
for key, field in self._obs_fields.items():
|
1793
|
-
feature = features.get(key)
|
1794
|
-
registry = field.field.model
|
1795
|
-
organism = check_registry_organism(field.field.model, self._organism).get(
|
1796
|
-
"organism"
|
1797
|
-
)
|
1798
|
-
labels = registry.from_values(
|
1799
|
-
values=self._validated_values[key], field=field, organism=organism
|
1800
|
-
)
|
1801
|
-
if len(labels) == 0:
|
1802
|
-
continue
|
1803
|
-
if hasattr(registry, "_name_field"):
|
1804
|
-
label_ref_is_name = field.field.name == registry._name_field
|
1805
|
-
add_labels(
|
1806
|
-
artifact,
|
1807
|
-
records=labels,
|
1808
|
-
feature=feature,
|
1809
|
-
feature_ref_is_name=feature_ref_is_name,
|
1810
|
-
label_ref_is_name=label_ref_is_name,
|
1811
|
-
from_curator=True,
|
1812
|
-
)
|
1813
|
-
|
1814
|
-
return artifact.save()
|
1815
|
-
|
1816
|
-
|
1817
|
-
class SpatialDataCatManager(CatManager):
|
1818
|
-
"""Curation flow for a ``Spatialdata`` object.
|
1819
|
-
|
1820
|
-
See also :class:`~lamindb.Curator`.
|
1821
|
-
|
1822
|
-
Note that if genes or other measurements are removed from the SpatialData object,
|
1823
|
-
the object should be recreated.
|
1894
|
+
self._is_validated: bool | None = False
|
1895
|
+
self._non_validated_values: dict[str, list] | None = None
|
1896
|
+
self._validated_values: dict[str, list] = {}
|
1897
|
+
# filled by _check_save_keys
|
1898
|
+
self._n_obs: int | None = None
|
1899
|
+
self._valid_obs_keys: list[str] | None = None
|
1900
|
+
self._obs_pa_schema: pa.lib.Schema | None = (
|
1901
|
+
None # this is needed to create the obs feature set
|
1902
|
+
)
|
1903
|
+
self._valid_var_keys: list[str] | None = None
|
1904
|
+
self._var_fields_flat: dict[str, FieldAttr] | None = None
|
1905
|
+
self._check_save_keys()
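A construction sketch for the manager defined here; the URI, the measurement name "RNA", the var column "var_id", and the obs column "cell_type" are placeholders for whatever the concrete `tiledbsoma.Experiment` contains:

    import bionty as bt
    from lamindb.curators import TiledbsomaCatManager

    curator = TiledbsomaCatManager(
        "s3://my-bucket/my-experiment.tiledbsoma",  # or an already saved Artifact
        var_index={"RNA": ("var_id", bt.Gene.ensembl_gene_id)},  # one (var column, field) pair per measurement
        categoricals={"cell_type": bt.CellType.name},
        organism="human",
    )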
|
1824
1906
|
|
1825
|
-
|
1907
|
+
# check that the provided keys in var_index and categoricals are available in the store
|
1908
|
+
# and save features
|
1909
|
+
def _check_save_keys(self):
|
1910
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1826
1911
|
|
1827
|
-
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1912
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1913
|
+
experiment_obs = experiment.obs
|
1914
|
+
self._n_obs = len(experiment_obs)
|
1915
|
+
self._obs_pa_schema = experiment_obs.schema
|
1916
|
+
valid_obs_keys = [
|
1917
|
+
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1918
|
+
]
|
1919
|
+
self._valid_obs_keys = valid_obs_keys
|
1831
1920
|
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1835
|
-
|
1836
|
-
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1840
|
-
|
1841
|
-
>>> import bionty as bt
|
1842
|
-
>>> curator = SpatialDataCatManager(
|
1843
|
-
... sdata,
|
1844
|
-
... var_index={
|
1845
|
-
... "table_1": bt.Gene.ensembl_gene_id,
|
1846
|
-
... },
|
1847
|
-
... categoricals={
|
1848
|
-
... "table1":
|
1849
|
-
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
|
1850
|
-
... "sample":
|
1851
|
-
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1852
|
-
... },
|
1853
|
-
... organism="human",
|
1854
|
-
... )
|
1855
|
-
"""
|
1921
|
+
valid_var_keys = []
|
1922
|
+
ms_list = []
|
1923
|
+
for ms in experiment.ms.keys():
|
1924
|
+
ms_list.append(ms)
|
1925
|
+
var_ms = experiment.ms[ms].var
|
1926
|
+
valid_var_keys += [
|
1927
|
+
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1928
|
+
]
|
1929
|
+
self._valid_var_keys = valid_var_keys
|
1856
1930
|
|
1857
|
-
|
1858
|
-
|
1859
|
-
|
1860
|
-
|
1861
|
-
|
1862
|
-
|
1863
|
-
organism: str | None = None,
|
1864
|
-
sources: dict[str, dict[str, Record]] | None = None,
|
1865
|
-
exclude: dict[str, dict] | None = None,
|
1866
|
-
*,
|
1867
|
-
sample_metadata_key: str | None = "sample",
|
1868
|
-
) -> None:
|
1869
|
-
super().__init__(
|
1870
|
-
dataset=sdata,
|
1871
|
-
categoricals={},
|
1872
|
-
sources=sources,
|
1873
|
-
organism=organism,
|
1874
|
-
exclude=exclude,
|
1875
|
-
)
|
1876
|
-
if isinstance(sdata, Artifact):
|
1877
|
-
self._sdata = sdata.load()
|
1878
|
-
else:
|
1879
|
-
self._sdata = self._dataset
|
1880
|
-
self._sample_metadata_key = sample_metadata_key
|
1881
|
-
self._write_path = None
|
1882
|
-
self._var_fields = var_index
|
1883
|
-
self._verify_accessor_exists(self._var_fields.keys())
|
1884
|
-
self._categoricals = categoricals
|
1885
|
-
self._table_keys = set(self._var_fields.keys()) | set(
|
1886
|
-
self._categoricals.keys() - {self._sample_metadata_key}
|
1887
|
-
)
|
1888
|
-
self._verbosity = verbosity
|
1889
|
-
self._sample_df_curator = None
|
1890
|
-
if self._sample_metadata_key is not None:
|
1891
|
-
self._sample_metadata = self._sdata.get_attrs(
|
1892
|
-
key=self._sample_metadata_key, return_as="df", flatten=True
|
1893
|
-
)
|
1894
|
-
self._is_validated = False
|
1931
|
+
# check validity of keys in categoricals
|
1932
|
+
nonval_keys = []
|
1933
|
+
for obs_key in self._obs_fields.keys():
|
1934
|
+
if obs_key not in valid_obs_keys:
|
1935
|
+
nonval_keys.append(obs_key)
|
1936
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1895
1937
|
|
1896
|
-
#
|
1938
|
+
# check validity of keys in var_index
|
1939
|
+
self._var_fields_flat = {}
|
1897
1940
|
nonval_keys = []
|
1898
|
-
for
|
1899
|
-
|
1900
|
-
|
1901
|
-
|
1902
|
-
|
1903
|
-
for key in accessor_categoricals.keys():
|
1904
|
-
if key not in self._sample_metadata.columns:
|
1905
|
-
nonval_keys.append(key)
|
1941
|
+
for ms_key in self._var_fields.keys():
|
1942
|
+
var_key, var_field = self._var_fields[ms_key]
|
1943
|
+
var_key_flat = f"{ms_key}__{var_key}"
|
1944
|
+
if var_key_flat not in valid_var_keys:
|
1945
|
+
nonval_keys.append(f"({ms_key}, {var_key})")
|
1906
1946
|
else:
|
1907
|
-
|
1908
|
-
|
1909
|
-
nonval_keys.append(key)
|
1910
|
-
|
1911
|
-
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1947
|
+
self._var_fields_flat[var_key_flat] = var_field
|
1948
|
+
_maybe_curation_keys_not_present(nonval_keys, "var_index")
|
1912
1949
|
|
1913
|
-
# check validity of keys in sources
|
1914
|
-
|
1915
|
-
|
1916
|
-
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
):
|
1921
|
-
columns = self._sample_metadata.columns
|
1922
|
-
elif accessor != self._sample_metadata_key:
|
1923
|
-
columns = self._sdata[accessor].obs.columns
|
1924
|
-
else:
|
1925
|
-
continue
|
1926
|
-
for key in accessor_sources:
|
1927
|
-
if key not in columns:
|
1928
|
-
nonval_keys.append(key)
|
1929
|
-
_maybe_curation_keys_not_present(nonval_keys, name)
|
1950
|
+
# check validity of keys in sources
|
1951
|
+
valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
|
1952
|
+
nonval_keys = []
|
1953
|
+
for arg_key in self._sources.keys():
|
1954
|
+
if arg_key not in valid_arg_keys:
|
1955
|
+
nonval_keys.append(arg_key)
|
1956
|
+
_maybe_curation_keys_not_present(nonval_keys, "sources")
|
1930
1957
|
|
1931
|
-
#
|
1958
|
+
# register obs columns' names
|
1959
|
+
register_columns = list(self._obs_fields.keys())
|
1960
|
+
organism = configure_organism(
|
1961
|
+
self._columns_field.field.model, self._organism
|
1962
|
+
).get("organism")
|
1963
|
+
update_registry(
|
1964
|
+
values=register_columns,
|
1965
|
+
field=self._columns_field,
|
1966
|
+
key="columns",
|
1967
|
+
validated_only=False,
|
1968
|
+
organism=organism,
|
1969
|
+
source=self._sources.get("columns"),
|
1970
|
+
)
|
1971
|
+
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1972
|
+
# no need to register with validated_only=True if columns are features
|
1932
1973
|
if (
|
1933
|
-
|
1934
|
-
and self.
|
1974
|
+
len(additional_columns) > 0
|
1975
|
+
and self._columns_field.field.model is not Feature
|
1935
1976
|
):
|
1936
|
-
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
|
1941
|
-
sources=self._sources.get(self._sample_metadata_key),
|
1942
|
-
exclude=self._exclude.get(self._sample_metadata_key),
|
1943
|
-
organism=organism,
|
1944
|
-
)
|
1945
|
-
self._table_adata_curators = {
|
1946
|
-
table: AnnDataCatManager(
|
1947
|
-
data=self._sdata[table],
|
1948
|
-
var_index=var_index.get(table),
|
1949
|
-
categoricals=self._categoricals.get(table),
|
1950
|
-
verbosity=verbosity,
|
1951
|
-
sources=self._sources.get(table),
|
1952
|
-
exclude=self._exclude.get(table),
|
1977
|
+
update_registry(
|
1978
|
+
values=additional_columns,
|
1979
|
+
field=self._columns_field,
|
1980
|
+
key="columns",
|
1981
|
+
validated_only=True,
|
1953
1982
|
organism=organism,
|
1983
|
+
source=self._sources.get("columns"),
|
1954
1984
|
)
|
1955
|
-
for table in self._table_keys
|
1956
|
-
}
|
1957
1985
|
|
1958
|
-
|
1986
|
+
def validate(self):
|
1987
|
+
"""Validate categories."""
|
1988
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1959
1989
|
|
1960
|
-
|
1961
|
-
|
1962
|
-
""
|
1963
|
-
|
1990
|
+
validated = True
|
1991
|
+
self._non_validated_values = {}
|
1992
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1993
|
+
for ms, (key, field) in self._var_fields.items():
|
1994
|
+
var_ms = experiment.ms[ms].var
|
1995
|
+
var_ms_key = f"{ms}__{key}"
|
1996
|
+
# it was already validated and cached
|
1997
|
+
if var_ms_key in self._validated_values:
|
1998
|
+
continue
|
1999
|
+
var_ms_values = (
|
2000
|
+
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
2001
|
+
)
|
2002
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2003
|
+
"organism"
|
2004
|
+
)
|
2005
|
+
update_registry(
|
2006
|
+
values=var_ms_values,
|
2007
|
+
field=field,
|
2008
|
+
key=var_ms_key,
|
2009
|
+
validated_only=True,
|
2010
|
+
organism=organism,
|
2011
|
+
source=self._sources.get(var_ms_key),
|
2012
|
+
)
|
2013
|
+
_, non_val = validate_categories(
|
2014
|
+
values=var_ms_values,
|
2015
|
+
field=field,
|
2016
|
+
key=var_ms_key,
|
2017
|
+
organism=organism,
|
2018
|
+
source=self._sources.get(var_ms_key),
|
2019
|
+
)
|
2020
|
+
if len(non_val) > 0:
|
2021
|
+
validated = False
|
2022
|
+
self._non_validated_values[var_ms_key] = non_val
|
2023
|
+
else:
|
2024
|
+
self._validated_values[var_ms_key] = var_ms_values
|
1964
2025
|
|
1965
|
-
|
1966
|
-
|
1967
|
-
|
1968
|
-
|
2026
|
+
obs = experiment.obs
|
2027
|
+
for key, field in self._obs_fields.items():
|
2028
|
+
# already validated and cached
|
2029
|
+
if key in self._validated_values:
|
2030
|
+
continue
|
2031
|
+
values = pa.compute.unique(
|
2032
|
+
obs.read(column_names=[key]).concat()[key]
|
2033
|
+
).to_pylist()
|
2034
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2035
|
+
"organism"
|
2036
|
+
)
|
2037
|
+
update_registry(
|
2038
|
+
values=values,
|
2039
|
+
field=field,
|
2040
|
+
key=key,
|
2041
|
+
validated_only=True,
|
2042
|
+
organism=organism,
|
2043
|
+
source=self._sources.get(key),
|
2044
|
+
)
|
2045
|
+
_, non_val = validate_categories(
|
2046
|
+
values=values,
|
2047
|
+
field=field,
|
2048
|
+
key=key,
|
2049
|
+
organism=organism,
|
2050
|
+
source=self._sources.get(key),
|
2051
|
+
)
|
2052
|
+
if len(non_val) > 0:
|
2053
|
+
validated = False
|
2054
|
+
self._non_validated_values[key] = non_val
|
2055
|
+
else:
|
2056
|
+
self._validated_values[key] = values
|
2057
|
+
self._is_validated = validated
|
2058
|
+
return self._is_validated
|
1969
2059
|
|
1970
|
-
|
1971
|
-
|
1972
|
-
"""Return the non-validated features and labels."""
|
1973
|
-
if self._non_validated is None:
|
1974
|
-
raise ValidationError("Please run validate() first!")
|
1975
|
-
return self._non_validated
|
2060
|
+
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
2061
|
+
assert self._non_validated_values is not None # noqa: S101
|
1976
2062
|
|
1977
|
-
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
1984
|
-
|
1985
|
-
|
1986
|
-
is_present = True
|
1987
|
-
if not is_present:
|
1988
|
-
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
2063
|
+
if key in self._valid_obs_keys:
|
2064
|
+
field = self._obs_fields[key]
|
2065
|
+
elif key in self._valid_var_keys:
|
2066
|
+
ms = key.partition("__")[0]
|
2067
|
+
field = self._var_fields[ms][1]
|
2068
|
+
else:
|
2069
|
+
raise KeyError(f"key {key} is invalid!")
|
2070
|
+
values = self._non_validated_values.get(key, [])
|
2071
|
+
return values, field
|
1989
2072
|
|
1990
|
-
def
|
1991
|
-
"""
|
2073
|
+
def add_new_from(self, key: str, **kwargs) -> None:
|
2074
|
+
"""Add validated & new categories.
|
1992
2075
|
|
1993
2076
|
Args:
|
1994
|
-
|
2077
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2078
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2079
|
+
or a column name in `.obs`.
|
1995
2080
|
"""
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
|
2003
|
-
def _update_registry_all(self) -> None:
|
2004
|
-
"""Saves labels of all features for sample and table metadata."""
|
2005
|
-
if self._sample_df_curator is not None:
|
2006
|
-
self._sample_df_curator._update_registry_all(
|
2007
|
-
validated_only=True,
|
2081
|
+
if self._non_validated_values is None:
|
2082
|
+
raise ValidationError("Run .validate() first.")
|
2083
|
+
if key == "all":
|
2084
|
+
keys = list(self._non_validated_values.keys())
|
2085
|
+
else:
|
2086
|
+
avail_keys = list(
|
2087
|
+
chain(self._non_validated_values.keys(), self._validated_values.keys())
|
2008
2088
|
)
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2089
|
+
if key not in avail_keys:
|
2090
|
+
raise KeyError(
|
2091
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2092
|
+
)
|
2093
|
+
keys = [key]
|
2094
|
+
for k in keys:
|
2095
|
+
values, field = self._non_validated_values_field(k)
|
2096
|
+
if len(values) == 0:
|
2097
|
+
continue
|
2098
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2099
|
+
"organism"
|
2100
|
+
)
|
2101
|
+
update_registry(
|
2102
|
+
values=values,
|
2103
|
+
field=field,
|
2104
|
+
key=k,
|
2105
|
+
validated_only=False,
|
2106
|
+
organism=organism,
|
2107
|
+
source=self._sources.get(k),
|
2108
|
+
**kwargs,
|
2012
2109
|
)
|
2110
|
+
# update non-validated values list but keep the key there
|
2111
|
+
# it will be removed by .validate()
|
2112
|
+
if k in self._non_validated_values:
|
2113
|
+
self._non_validated_values[k] = []
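The key convention from the docstring above, as a short sketch (assuming the `curator` from the earlier construction example with a measurement "RNA" and an obs column "cell_type"):

    curator.add_new_from("RNA__var_id")  # '{measurement name}__{column in .var}' targets a measurement's .var
    curator.add_new_from("cell_type")    # a bare key targets a column of .obs
    curator.add_new_from("all")          # or register everything that is still non-validated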
|
2013
2114
|
|
2014
|
-
|
2015
|
-
|
2115
|
+
@property
|
2116
|
+
def non_validated(self) -> dict[str, list]:
|
2117
|
+
"""Return the non-validated features and labels."""
|
2118
|
+
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
2119
|
+
return non_val
|
2016
2120
|
|
2017
|
-
|
2018
|
-
|
2019
|
-
|
2020
|
-
|
2021
|
-
"""
|
2022
|
-
if self._non_validated is None:
|
2023
|
-
raise ValidationError("Run .validate() first.")
|
2024
|
-
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
2025
|
-
if table in self.non_validated.keys():
|
2026
|
-
if "var_index" in self._non_validated[table]:
|
2027
|
-
self._non_validated[table].pop("var_index")
|
2121
|
+
@property
|
2122
|
+
def var_index(self) -> dict[str, FieldAttr]:
|
2123
|
+
"""Return the registry fields with flattened keys to validate variables indices against."""
|
2124
|
+
return self._var_fields_flat
|
2028
2125
|
|
2029
|
-
|
2030
|
-
|
2126
|
+
@property
|
2127
|
+
def categoricals(self) -> dict[str, FieldAttr]:
|
2128
|
+
"""Return the obs fields to validate against."""
|
2129
|
+
return self._obs_fields
|
2031
2130
|
|
2032
|
-
def
|
2033
|
-
|
2034
|
-
key: str,
|
2035
|
-
accessor: str | None = None,
|
2036
|
-
**kwargs,
|
2037
|
-
) -> None:
|
2038
|
-
"""Save new values of categorical from sample level metadata or table.
|
2131
|
+
def lookup(self, public: bool = False) -> CatLookup:
|
2132
|
+
"""Lookup categories.
|
2039
2133
|
|
2040
2134
|
Args:
|
2041
|
-
|
2042
|
-
accessor: The accessor key such as 'sample' or 'table x'.
|
2043
|
-
organism: The organism name.
|
2044
|
-
**kwargs: Additional keyword arguments to pass to create new records.
|
2135
|
+
public: If "public", the lookup is performed on the public reference.
|
2045
2136
|
"""
|
2046
|
-
|
2047
|
-
|
2048
|
-
|
2049
|
-
|
2050
|
-
|
2051
|
-
|
2052
|
-
if accessor not in self.categoricals:
|
2053
|
-
raise ValueError(
|
2054
|
-
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
2055
|
-
)
|
2056
|
-
|
2057
|
-
if accessor in self._table_adata_curators:
|
2058
|
-
adata_curator = self._table_adata_curators[accessor]
|
2059
|
-
adata_curator.add_new_from(key=key, **kwargs)
|
2060
|
-
if accessor == self._sample_metadata_key:
|
2061
|
-
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
2062
|
-
|
2063
|
-
if accessor in self.non_validated.keys():
|
2064
|
-
if len(self.non_validated[accessor].values()) == 0:
|
2065
|
-
self.non_validated.pop(accessor)
|
2137
|
+
return CatLookup(
|
2138
|
+
categoricals=self._obs_fields,
|
2139
|
+
slots={"columns": self._columns_field, **self._var_fields_flat},
|
2140
|
+
public=public,
|
2141
|
+
)
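A sketch of how the returned lookup object is typically used; that the configured keys are exposed as attributes (here `.cell_type`) is an assumption based on the construction example above, not something this diff shows:

    lookup = curator.lookup()                    # auto-complete over records in the current instance
    public_lookup = curator.lookup(public=True)  # or against the public ontology reference
    terms = lookup.cell_type                     # one lookup namespace per categorical key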
|
2066
2142
|
|
2067
|
-
def standardize(self, key: str
|
2068
|
-
"""Replace synonyms with
|
2143
|
+
def standardize(self, key: str):
|
2144
|
+
"""Replace synonyms with standardized values.
|
2069
2145
|
|
2070
2146
|
Modifies the dataset inplace.
|
2071
2147
|
|
2072
2148
|
Args:
|
2073
|
-
key: The key referencing the slot in the
|
2074
|
-
|
2149
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
2150
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
2151
|
+
or a column name in `.obs`.
|
2075
2152
|
"""
|
2076
2153
|
if len(self.non_validated) == 0:
|
2077
2154
|
logger.warning("values are already standardized")
|
2078
2155
|
return
|
2079
|
-
|
2080
|
-
|
2081
|
-
|
2082
|
-
if accessor == self._sample_metadata_key:
|
2083
|
-
if key not in self._sample_metadata.columns:
|
2084
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2156
|
+
avail_keys = list(self._non_validated_values.keys())
|
2157
|
+
if key == "all":
|
2158
|
+
keys = avail_keys
|
2085
2159
|
else:
|
2086
|
-
if
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
):
|
2092
|
-
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2093
|
-
|
2094
|
-
if accessor in self._table_adata_curators.keys():
|
2095
|
-
adata_curator = self._table_adata_curators[accessor]
|
2096
|
-
adata_curator.standardize(key)
|
2097
|
-
if accessor == self._sample_metadata_key:
|
2098
|
-
self._sample_df_curator.standardize(key)
|
2099
|
-
|
2100
|
-
if len(self.non_validated[accessor].values()) == 0:
|
2101
|
-
self.non_validated.pop(accessor)
|
2102
|
-
|
2103
|
-
def validate(self) -> bool:
|
2104
|
-
"""Validate variables and categorical observations.
|
2105
|
-
|
2106
|
-
This method also registers the validated records in the current instance:
|
2107
|
-
- from public sources
|
2108
|
-
|
2109
|
-
Args:
|
2110
|
-
organism: The organism name.
|
2160
|
+
if key not in avail_keys:
|
2161
|
+
raise KeyError(
|
2162
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
2163
|
+
)
|
2164
|
+
keys = [key]
|
2111
2165
|
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
2115
|
-
|
2166
|
+
for k in keys:
|
2167
|
+
values, field = self._non_validated_values_field(k)
|
2168
|
+
if len(values) == 0:
|
2169
|
+
continue
|
2170
|
+
if k in self._valid_var_keys:
|
2171
|
+
ms, _, slot_key = k.partition("__")
|
2172
|
+
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
2173
|
+
else:
|
2174
|
+
slot = lambda experiment: experiment.obs
|
2175
|
+
slot_key = k
|
2176
|
+
# errors if public ontology and the model has no organism
|
2177
|
+
# has to be fixed in bionty
|
2178
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2179
|
+
"organism"
|
2180
|
+
)
|
2181
|
+
syn_mapper = standardize_categories(
|
2182
|
+
values=values,
|
2183
|
+
field=field,
|
2184
|
+
source=self._sources.get(k),
|
2185
|
+
organism=organism,
|
2186
|
+
)
|
2187
|
+
if (n_syn_mapper := len(syn_mapper)) == 0:
|
2188
|
+
continue
|
2116
2189
|
|
2117
|
-
|
2118
|
-
verbosity = settings.verbosity
|
2119
|
-
try:
|
2120
|
-
settings.verbosity = "error"
|
2121
|
-
self._update_registry_all()
|
2122
|
-
finally:
|
2123
|
-
settings.verbosity = verbosity
|
2190
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
2124
2191
|
|
2125
|
-
|
2192
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
2193
|
+
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
2194
|
+
table = slot(experiment).read(value_filter=value_filter).concat()
|
2126
2195
|
|
2127
|
-
|
2128
|
-
|
2129
|
-
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
2130
|
-
sample_validated &= self._sample_df_curator.validate()
|
2131
|
-
if len(self._sample_df_curator.non_validated) > 0:
|
2132
|
-
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
2133
|
-
logger.print("")
|
2196
|
+
if len(table) == 0:
|
2197
|
+
continue
|
2134
2198
|
|
2135
|
-
|
2136
|
-
|
2137
|
-
|
2138
|
-
|
2139
|
-
|
2140
|
-
|
2141
|
-
|
2199
|
+
df = table.to_pandas()
|
2200
|
+
# map values
|
2201
|
+
df[slot_key] = df[slot_key].map(
|
2202
|
+
lambda val: syn_mapper.get(val, val) # noqa
|
2203
|
+
)
|
2204
|
+
# write the mapped values
|
2205
|
+
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
2206
|
+
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
2207
|
+
# update non_validated dict
|
2208
|
+
non_val_k = [
|
2209
|
+
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
2210
|
+
]
|
2211
|
+
self._non_validated_values[k] = non_val_k
|
2142
2212
|
|
2143
|
-
|
2144
|
-
|
2213
|
+
syn_mapper_print = _format_values(
|
2214
|
+
[f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
|
2215
|
+
)
|
2216
|
+
s = "s" if n_syn_mapper > 1 else ""
|
2217
|
+
logger.success(
|
2218
|
+
f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
|
2219
|
+
)
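Usage sketch for the write-back standardization above, again with the illustrative keys from earlier; note that this mutates the store, so it needs write access to the `tiledbsoma` URI:

    curator.standardize("RNA__var_id")  # fix synonyms in the measurement's .var column
    curator.standardize("all")          # or standardize every remaining non-validated key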
|
2145
2220
|
|
2146
2221
|
def save_artifact(
|
2147
2222
|
self,
|
@@ -2151,217 +2226,119 @@ class SpatialDataCatManager(CatManager):
|
|
2151
2226
|
revises: Artifact | None = None,
|
2152
2227
|
run: Run | None = None,
|
2153
2228
|
) -> Artifact:
|
2229
|
+
"""Save the validated `tiledbsoma` store and metadata.
|
2230
|
+
|
2231
|
+
Args:
|
2232
|
+
description: A description of the ``tiledbsoma`` store.
|
2233
|
+
key: A path-like key to reference artifact in default storage,
|
2234
|
+
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
|
2235
|
+
revises: Previous version of the artifact. Triggers a revision.
|
2236
|
+
run: The run that creates the artifact.
|
2237
|
+
|
2238
|
+
Returns:
|
2239
|
+
A saved artifact record.
|
2240
|
+
"""
|
2154
2241
|
if not self._is_validated:
|
2155
2242
|
self.validate()
|
2156
2243
|
if not self._is_validated:
|
2157
2244
|
raise ValidationError("Dataset does not validate. Please curate.")
|
2158
2245
|
|
2159
|
-
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
self._artifact = Artifact.from_spatialdata(
|
2164
|
-
self._sdata,
|
2165
|
-
key=key,
|
2246
|
+
if self._artifact is None:
|
2247
|
+
artifact = Artifact(
|
2248
|
+
self._dataset,
|
2166
2249
|
description=description,
|
2250
|
+
key=key,
|
2167
2251
|
revises=revises,
|
2168
2252
|
run=run,
|
2169
2253
|
)
|
2170
|
-
self.
|
2254
|
+
artifact.n_observations = self._n_obs
|
2255
|
+
artifact.otype = "tiledbsoma"
|
2256
|
+
artifact.save()
|
2257
|
+
else:
|
2258
|
+
artifact = self._artifact
|
2171
2259
|
|
2172
|
-
|
2173
|
-
|
2174
|
-
|
2175
|
-
self._organism
|
2260
|
+
feature_sets = {}
|
2261
|
+
if len(self._obs_fields) > 0:
|
2262
|
+
organism = configure_organism(
|
2263
|
+
self._columns_field.field.model, self._organism
|
2264
|
+
).get("organism")
|
2265
|
+
empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
|
2266
|
+
mock_df = pa.Table.from_pydict(
|
2267
|
+
empty_dict, schema=self._obs_pa_schema
|
2268
|
+
).to_pandas()
|
2269
|
+
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
2270
|
+
feature_sets["obs"] = Schema.from_df(
|
2271
|
+
df=mock_df,
|
2272
|
+
field=self._columns_field,
|
2273
|
+
mute=True,
|
2274
|
+
organism=organism,
|
2176
2275
|
)
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
obs_fields: dict[str, FieldAttr] = None,
|
2182
|
-
mute: bool = False,
|
2183
|
-
organism: str | Record | None = None,
|
2184
|
-
):
|
2185
|
-
"""Add Schemas from SpatialData."""
|
2186
|
-
if obs_fields is None:
|
2187
|
-
obs_fields = {}
|
2188
|
-
assert host.otype == "SpatialData" # noqa: S101
|
2189
|
-
|
2190
|
-
feature_sets = {}
|
2191
|
-
|
2192
|
-
# sample features
|
2193
|
-
sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
|
2194
|
-
if len(sample_features) > 0:
|
2195
|
-
feature_sets[self._sample_metadata_key] = Schema(
|
2196
|
-
features=sample_features
|
2197
|
-
)
|
2198
|
-
|
2199
|
-
# table features
|
2200
|
-
for table, field in var_fields.items():
|
2201
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
2202
|
-
self._sdata[table],
|
2203
|
-
var_field=field,
|
2204
|
-
obs_field=obs_fields.get(table, Feature.name),
|
2205
|
-
mute=mute,
|
2206
|
-
organism=organism,
|
2207
|
-
)
|
2208
|
-
for k, v in table_fs.items():
|
2209
|
-
feature_sets[f"['{table}'].{k}"] = v
|
2210
|
-
|
2211
|
-
def _unify_staged_feature_sets_by_hash(
|
2212
|
-
feature_sets: MutableMapping[str, Schema],
|
2213
|
-
):
|
2214
|
-
unique_values: dict[str, Any] = {}
|
2215
|
-
|
2216
|
-
for key, value in feature_sets.items():
|
2217
|
-
value_hash = (
|
2218
|
-
value.hash
|
2219
|
-
) # Assuming each value has a .hash attribute
|
2220
|
-
if value_hash in unique_values:
|
2221
|
-
feature_sets[key] = unique_values[value_hash]
|
2222
|
-
else:
|
2223
|
-
unique_values[value_hash] = value
|
2224
|
-
|
2225
|
-
return feature_sets
|
2226
|
-
|
2227
|
-
# link feature sets
|
2228
|
-
host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
|
2229
|
-
feature_sets
|
2230
|
-
)
|
2231
|
-
host.save()
|
2232
|
-
|
2233
|
-
_add_set_from_spatialdata(
|
2234
|
-
self._artifact, var_fields=self._var_fields, **feature_kwargs
|
2276
|
+
for ms in self._var_fields:
|
2277
|
+
var_key, var_field = self._var_fields[ms]
|
2278
|
+
organism = configure_organism(var_field.field.model, self._organism).get(
|
2279
|
+
"organism"
|
2235
2280
|
)
|
2281
|
+
feature_sets[f"{ms}__var"] = Schema.from_values(
|
2282
|
+
values=self._validated_values[f"{ms}__{var_key}"],
|
2283
|
+
field=var_field,
|
2284
|
+
organism=organism,
|
2285
|
+
raise_validation_error=False,
|
2286
|
+
)
|
2287
|
+
artifact._staged_feature_sets = feature_sets
|
2236
2288
|
|
2237
|
-
|
2238
|
-
|
2239
|
-
|
2240
|
-
|
2241
|
-
|
2242
|
-
|
2243
|
-
|
2244
|
-
|
2245
|
-
|
2246
|
-
|
2247
|
-
feature = features.get(key)
|
2248
|
-
registry = field.field.model
|
2249
|
-
filter_kwargs = check_registry_organism(registry, self._organism)
|
2250
|
-
filter_kwargs_current = get_current_filter_kwargs(
|
2251
|
-
registry, filter_kwargs
|
2252
|
-
)
|
2253
|
-
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2254
|
-
labels = registry.from_values(
|
2255
|
-
df[key],
|
2256
|
-
field=field,
|
2257
|
-
**filter_kwargs_current,
|
2258
|
-
)
|
2259
|
-
if len(labels) == 0:
|
2260
|
-
continue
|
2261
|
-
|
2262
|
-
label_ref_is_name = None
|
2263
|
-
if hasattr(registry, "_name_field"):
|
2264
|
-
label_ref_is_name = field.field.name == registry._name_field
|
2265
|
-
add_labels(
|
2266
|
-
artifact,
|
2267
|
-
records=labels,
|
2268
|
-
feature=feature,
|
2269
|
-
feature_ref_is_name=feature_ref_is_name,
|
2270
|
-
label_ref_is_name=label_ref_is_name,
|
2271
|
-
from_curator=True,
|
2272
|
-
)
|
2273
|
-
|
2274
|
-
for accessor, accessor_fields in self._categoricals.items():
|
2275
|
-
column_field = self._var_fields.get(accessor)
|
2276
|
-
if accessor == self._sample_metadata_key:
|
2277
|
-
_add_labels_from_spatialdata(
|
2278
|
-
self._sample_metadata,
|
2279
|
-
self._artifact,
|
2280
|
-
accessor_fields,
|
2281
|
-
feature_ref_is_name=(
|
2282
|
-
None if column_field is None else _ref_is_name(column_field)
|
2283
|
-
),
|
2284
|
-
)
|
2285
|
-
else:
|
2286
|
-
_add_labels_from_spatialdata(
|
2287
|
-
self._sdata.tables[accessor],
|
2288
|
-
self._artifact,
|
2289
|
-
accessor_fields,
|
2290
|
-
feature_ref_is_name=(
|
2291
|
-
None if column_field is None else _ref_is_name(column_field)
|
2292
|
-
),
|
2293
|
-
)
|
2294
|
-
|
2295
|
-
finally:
|
2296
|
-
settings.verbosity = verbosity
|
2297
|
-
|
2298
|
-
slug = ln_setup.settings.instance.slug
|
2299
|
-
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
2300
|
-
logger.important(
|
2301
|
-
f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
|
2289
|
+
feature_ref_is_name = _ref_is_name(self._columns_field)
|
2290
|
+
features = Feature.lookup().dict()
|
2291
|
+
for key, field in self._obs_fields.items():
|
2292
|
+
feature = features.get(key)
|
2293
|
+
registry = field.field.model
|
2294
|
+
organism = configure_organism(field.field.model, self._organism).get(
|
2295
|
+
"organism"
|
2296
|
+
)
|
2297
|
+
labels = registry.from_values(
|
2298
|
+
values=self._validated_values[key], field=field, organism=organism
|
2302
2299
|
)
|
2300
|
+
if len(labels) == 0:
|
2301
|
+
continue
|
2302
|
+
if hasattr(registry, "_name_field"):
|
2303
|
+
label_ref_is_name = field.field.name == registry._name_field
|
2304
|
+
add_labels(
|
2305
|
+
artifact,
|
2306
|
+
records=labels,
|
2307
|
+
feature=feature,
|
2308
|
+
feature_ref_is_name=feature_ref_is_name,
|
2309
|
+
label_ref_is_name=label_ref_is_name,
|
2310
|
+
from_curator=True,
|
2311
|
+
)
|
2303
2312
|
|
2304
|
-
return
|
2313
|
+
return artifact.save()
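End to end, the manager above is typically driven like this (description and key are illustrative):

    if curator.validate():
        artifact = curator.save_artifact(
            description="curated single-cell experiment",
            key="myfolder/mystore.tiledbsoma",
        )
    else:
        curator.add_new_from("all")  # or standardize() / fix the data, then re-validate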
|
2305
2314
|
|
2306
2315
|
|
2307
|
-
|
2308
|
-
|
2309
|
-
) -> dict[str, str]:
|
2310
|
-
"""Restrict the obs fields to name return only available obs fields.
|
2316
|
+
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2317
|
+
"""Categorical manager for `AnnData` respecting the CELLxGENE schema.
|
2311
2318
|
|
2312
|
-
|
2313
|
-
If both are available, we validate against ontology_id.
|
2314
|
-
If none are available, we validate against name.
|
2319
|
+
This will be superseded by a schema-based curation flow.
|
2315
2320
|
"""
|
2316
|
-
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
2317
|
-
for name, field in obs_fields.items():
|
2318
|
-
if name.endswith("_ontology_term_id"):
|
2319
|
-
continue
|
2320
|
-
# if both the ontology id and the name are present, only validate on the ontology_id
|
2321
|
-
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
2322
|
-
obs_fields_unique.pop(name)
|
2323
|
-
# if the neither name nor ontology id are present, validate on the name
|
2324
|
-
# this will raise error downstream, we just use name to be more readable
|
2325
|
-
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2326
|
-
obs_fields_unique[name] = field
|
2327
|
-
|
2328
|
-
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
2329
|
-
available_obs_fields = {
|
2330
|
-
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
2331
|
-
}
|
2332
|
-
|
2333
|
-
return available_obs_fields
|
2334
|
-
|
2335
2321
|
|
2336
|
-
|
2337
|
-
|
2338
|
-
|
2339
|
-
|
2340
|
-
|
2341
|
-
|
2342
|
-
|
2343
|
-
|
2344
|
-
|
2345
|
-
|
2346
|
-
logger.important(
|
2347
|
-
f"added default value '{default}' to the adata.obs['{name}']"
|
2348
|
-
)
|
2349
|
-
|
2350
|
-
|
2351
|
-
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2352
|
-
"""Annotation flow of AnnData based on CELLxGENE schema."""
|
2353
|
-
|
2354
|
-
_controls_were_created: bool | None = None
|
2322
|
+
cxg_categoricals_defaults = {
|
2323
|
+
"cell_type": "unknown",
|
2324
|
+
"development_stage": "unknown",
|
2325
|
+
"disease": "normal",
|
2326
|
+
"donor_id": "unknown",
|
2327
|
+
"self_reported_ethnicity": "unknown",
|
2328
|
+
"sex": "unknown",
|
2329
|
+
"suspension_type": "cell",
|
2330
|
+
"tissue_type": "tissue",
|
2331
|
+
}
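Applying these defaults amounts to filling in columns that are missing from `adata.obs` before sources are resolved; a rough plain-pandas sketch of the idea (assuming `adata` is an `anndata.AnnData` object), not the packaged `_add_defaults_to_obs` helper itself:

    defaults = {"disease": "normal", "sex": "unknown", "tissue_type": "tissue"}
    for column, default in defaults.items():
        if column not in adata.obs.columns:
            adata.obs[column] = default  # create the missing column with its default value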
|
2355
2332
|
|
2356
2333
|
def __init__(
|
2357
2334
|
self,
|
2358
|
-
adata: ad.AnnData
|
2335
|
+
adata: ad.AnnData,
|
2359
2336
|
categoricals: dict[str, FieldAttr] | None = None,
|
2360
2337
|
organism: Literal["human", "mouse"] = "human",
|
2361
2338
|
*,
|
2339
|
+
schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
|
2362
2340
|
defaults: dict[str, str] = None,
|
2363
2341
|
extra_sources: dict[str, Record] = None,
|
2364
|
-
schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
|
2365
2342
|
verbosity: str = "hint",
|
2366
2343
|
) -> None:
|
2367
2344
|
"""CELLxGENE schema curator.
|
@@ -2371,304 +2348,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2371
2348
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
2372
2349
|
The CELLxGENE Curator maps against the required CELLxGENE fields by default.
|
2373
2350
|
organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
|
2351
|
+
schema_version: The CELLxGENE schema version to curate against.
|
2374
2352
|
defaults: Default values that are set if columns or column values are missing.
|
2375
2353
|
extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
|
2376
2354
|
These extra sources are joined with the CELLxGENE fixed sources.
|
2377
2355
|
Use this parameter when subclassing.
|
2378
|
-
exclude: A dictionary mapping column names to values to exclude.
|
2379
|
-
schema_version: The CELLxGENE schema version to curate against.
|
2380
2356
|
verbosity: The verbosity level.
|
2381
|
-
|
2382
2357
|
"""
|
2383
2358
|
import bionty as bt
|
2384
2359
|
|
2385
|
-
|
2360
|
+
from ._cellxgene_schemas import (
|
2361
|
+
_add_defaults_to_obs,
|
2362
|
+
_create_sources,
|
2363
|
+
_init_categoricals_additional_values,
|
2364
|
+
_restrict_obs_fields,
|
2365
|
+
)
|
2386
2366
|
|
2387
|
-
|
2367
|
+
# Add defaults first to ensure that we fetch valid sources
|
2368
|
+
if defaults:
|
2369
|
+
_add_defaults_to_obs(adata.obs, defaults)
|
2388
2370
|
|
2371
|
+
# Filter categoricals based on what's present in adata
|
2389
2372
|
if categoricals is None:
|
2390
|
-
categoricals =
|
2373
|
+
categoricals = self._get_cxg_categoricals()
|
2374
|
+
categoricals = _restrict_obs_fields(adata.obs, categoricals)
|
2391
2375
|
|
2392
|
-
|
2393
|
-
|
2394
|
-
VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
|
2395
|
-
if schema_version not in VALID_SCHEMA_VERSIONS:
|
2396
|
-
valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
|
2397
|
-
raise ValueError(
|
2398
|
-
f"Invalid schema_version: {schema_version}. "
|
2399
|
-
f"Valid versions are: {valid_versions}"
|
2400
|
-
)
|
2376
|
+
# Configure sources
|
2377
|
+
sources = _create_sources(categoricals, schema_version, organism)
|
2401
2378
|
self.schema_version = schema_version
|
2402
2379
|
self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
|
2403
|
-
with resources.path(
|
2404
|
-
"lamindb.curators._cellxgene_schemas", "schema_versions.yml"
|
2405
|
-
) as schema_versions_path:
|
2406
|
-
self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
|
2407
|
-
self.schema_version
|
2408
|
-
]
|
2409
|
-
|
2410
|
-
# Fetch AnnData obs to be able to set defaults and get sources
|
2411
|
-
if isinstance(adata, ad.AnnData):
|
2412
|
-
self._adata_obs = adata.obs
|
2413
|
-
else:
|
2414
|
-
self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
|
2415
|
-
|
2416
|
-
# Add defaults first to ensure that we fetch valid sources
|
2417
|
-
if defaults:
|
2418
|
-
_add_defaults_to_obs(self._adata_obs, defaults)
|
2419
|
-
|
2420
|
-
self.sources = self._create_sources(self._adata_obs)
|
2421
|
-
self.sources = {
|
2422
|
-
entity: source
|
2423
|
-
for entity, source in self.sources.items()
|
2424
|
-
if source is not None
|
2425
|
-
}
|
2426
|
-
|
2427
2380
|
# These sources are not a part of the cellxgene schema but rather passed through.
|
2428
2381
|
# This is useful when other Curators extend the CELLxGENE curator
|
2429
2382
|
if extra_sources:
|
2430
|
-
|
2383
|
+
sources = sources | extra_sources
|
2431
2384
|
|
2432
|
-
|
2433
|
-
exclude_keys = {
|
2434
|
-
entity: default
|
2435
|
-
for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
|
2436
|
-
if entity in self._adata_obs.columns # type: ignore
|
2437
|
-
}
|
2385
|
+
_init_categoricals_additional_values()
|
2438
2386
|
|
2439
2387
|
super().__init__(
|
2440
2388
|
data=adata,
|
2441
|
-
var_index=
|
2442
|
-
categoricals=
|
2389
|
+
var_index=bt.Gene.ensembl_gene_id,
|
2390
|
+
categoricals=categoricals,
|
2443
2391
|
verbosity=verbosity,
|
2444
2392
|
organism=organism,
|
2445
|
-
sources=
|
2446
|
-
exclude=exclude_keys,
|
2393
|
+
sources=sources,
|
2447
2394
|
)
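A fuller construction sketch for the constructor above; when `categoricals` is left as `None`, the mapping from obs columns to ontology fields comes from the packaged CELLxGENE schema, so only the schema version and defaults are pinned here (the variable `adata` is assumed to hold an AnnData object):

    from lamindb.curators import CellxGeneAnnDataCatManager

    curator = CellxGeneAnnDataCatManager(
        adata,
        organism="human",
        schema_version="5.2.0",  # one of "4.0.0", "5.0.0", "5.1.0", "5.2.0"
        defaults=CellxGeneAnnDataCatManager.cxg_categoricals_defaults,
    )
    curator.validate()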
|
2448
2395
|
|
2449
2396
|
@classmethod
|
2450
|
-
|
2451
|
-
import bionty as bt
|
2452
|
-
|
2453
|
-
import lamindb as ln
|
2454
|
-
|
2455
|
-
# Note: if you add another control below, be mindful to change the if condition that
|
2456
|
-
# triggers whether creating these records is re-considered
|
2457
|
-
if cls._controls_were_created is None:
|
2458
|
-
cls._controls_were_created = (
|
2459
|
-
ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
|
2460
|
-
is not None
|
2461
|
-
)
|
2462
|
-
if not cls._controls_were_created:
|
2463
|
-
logger.important("Creating control labels in the CellxGene schema.")
|
2464
|
-
bt.CellType(
|
2465
|
-
ontology_id="unknown",
|
2466
|
-
name="unknown",
|
2467
|
-
description="From CellxGene schema.",
|
2468
|
-
).save()
|
2469
|
-
pato = bt.Source.filter(name="pato", version="2024-03-28").one()
|
2470
|
-
normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
|
2471
|
-
bt.Disease(
|
2472
|
-
uid=normal.uid,
|
2473
|
-
name=normal.name,
|
2474
|
-
ontology_id=normal.ontology_id,
|
2475
|
-
description=normal.description,
|
2476
|
-
source=normal.source,
|
2477
|
-
).save()
|
2478
|
-
bt.Ethnicity(
|
2479
|
-
ontology_id="na", name="na", description="From CellxGene schema."
|
2480
|
-
).save()
|
2481
|
-
bt.Ethnicity(
|
2482
|
-
ontology_id="unknown",
|
2483
|
-
name="unknown",
|
2484
|
-
description="From CellxGene schema.",
|
2485
|
-
).save()
|
2486
|
-
bt.DevelopmentalStage(
|
2487
|
-
ontology_id="unknown",
|
2488
|
-
name="unknown",
|
2489
|
-
description="From CellxGene schema.",
|
2490
|
-
).save()
|
2491
|
-
bt.Phenotype(
|
2492
|
-
ontology_id="unknown",
|
2493
|
-
name="unknown",
|
2494
|
-
description="From CellxGene schema.",
|
2495
|
-
).save()
|
2496
|
-
|
2497
|
-
tissue_type = ln.ULabel(
|
2498
|
-
name="TissueType",
|
2499
|
-
is_type=True,
|
2500
|
-
description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
|
2501
|
-
).save()
|
2502
|
-
ln.ULabel(
|
2503
|
-
name="tissue", type=tissue_type, description="From CellxGene schema."
|
2504
|
-
).save()
|
2505
|
-
ln.ULabel(
|
2506
|
-
name="organoid", type=tissue_type, description="From CellxGene schema."
|
2507
|
-
).save()
|
2508
|
-
ln.ULabel(
|
2509
|
-
name="cell culture",
|
2510
|
-
type=tissue_type,
|
2511
|
-
description="From CellxGene schema.",
|
2512
|
-
).save()
|
2513
|
-
|
2514
|
-
suspension_type = ln.ULabel(
|
2515
|
-
name="SuspensionType",
|
2516
|
-
is_type=True,
|
2517
|
-
description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
|
2518
|
-
).save()
|
2519
|
-
ln.ULabel(
|
2520
|
-
name="cell", type=suspension_type, description="From CellxGene schema."
|
2521
|
-
).save()
|
2522
|
-
ln.ULabel(
|
2523
|
-
name="nucleus",
|
2524
|
-
type=suspension_type,
|
2525
|
-
description="From CellxGene schema.",
|
2526
|
-
).save()
|
2527
|
-
ln.ULabel(name="na", type=suspension_type).save()
|
2528
|
-
|
2529
|
-
@classmethod
|
2530
|
-
def _get_categoricals(cls) -> dict[str, FieldAttr]:
|
2531
|
-
import bionty as bt
|
2532
|
-
|
2533
|
-
return {
|
2534
|
-
"assay": bt.ExperimentalFactor.name,
|
2535
|
-
"assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
|
2536
|
-
"cell_type": bt.CellType.name,
|
2537
|
-
"cell_type_ontology_term_id": bt.CellType.ontology_id,
|
2538
|
-
"development_stage": bt.DevelopmentalStage.name,
|
2539
|
-
"development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
|
2540
|
-
"disease": bt.Disease.name,
|
2541
|
-
"disease_ontology_term_id": bt.Disease.ontology_id,
|
2542
|
-
# "donor_id": "str", via pandera
|
2543
|
-
"self_reported_ethnicity": bt.Ethnicity.name,
|
2544
|
-
"self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
|
2545
|
-
"sex": bt.Phenotype.name,
|
2546
|
-
"sex_ontology_term_id": bt.Phenotype.ontology_id,
|
2547
|
-
"suspension_type": ULabel.name,
|
2548
|
-
"tissue": bt.Tissue.name,
|
2549
|
-
"tissue_ontology_term_id": bt.Tissue.ontology_id,
|
2550
|
-
"tissue_type": ULabel.name,
|
2551
|
-
"organism": bt.Organism.name,
|
2552
|
-
"organism_ontology_term_id": bt.Organism.ontology_id,
|
2553
|
-
}
|
2554
|
-
|
2555
|
-
@classmethod
|
2397
|
+
@deprecated(new_name="cxg_categoricals_defaults")
|
2556
2398
|
def _get_categoricals_defaults(cls) -> dict[str, str]:
|
2557
|
-
return
|
2558
|
-
"cell_type": "unknown",
|
2559
|
-
"development_stage": "unknown",
|
2560
|
-
"disease": "normal",
|
2561
|
-
"donor_id": "unknown",
|
2562
|
-
"self_reported_ethnicity": "unknown",
|
2563
|
-
"sex": "unknown",
|
2564
|
-
"suspension_type": "cell",
|
2565
|
-
"tissue_type": "tissue",
|
2566
|
-
}
|
2567
|
-
|
2568
|
-
@property
|
2569
|
-
def pinned_ontologies(self) -> pd.DataFrame:
|
2570
|
-
return self._pinned_ontologies
|
2571
|
-
|
2572
|
-
@property
|
2573
|
-
def adata(self) -> AnnData:
|
2574
|
-
return self._adata
|
2575
|
-
|
2576
|
-
def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
|
2577
|
-
"""Creates a sources dictionary that can be passed to AnnDataCatManager."""
|
2578
|
-
import bionty as bt
|
2579
|
-
|
2580
|
-
# fmt: off
|
2581
|
-
def _fetch_bionty_source(
|
2582
|
-
entity: str, organism: str, source: str
|
2583
|
-
) -> bt.Source | None:
|
2584
|
-
"""Fetch the Bionty source of the pinned ontology.
|
2399
|
+
return cls.cxg_categoricals_defaults
|
2585
2400
|
|
2586
|
-
|
2587
|
-
|
2588
|
-
|
2589
|
-
|
2590
|
-
(self._pinned_ontologies["source"] == source), "version"].iloc[0]
|
2591
|
-
return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
|
2592
|
-
|
2593
|
-
entity_mapping = {
|
2594
|
-
"var_index": ("Gene", self.organism, "ensembl"),
|
2595
|
-
"cell_type": ("CellType", "all", "cl"),
|
2596
|
-
"assay": ("ExperimentalFactor", "all", "efo"),
|
2597
|
-
"self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
|
2598
|
-
"development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
|
2599
|
-
"disease": ("Disease", "all", "mondo"),
|
2600
|
-
# "organism": ("Organism", "vertebrates", "ensembl"),
|
2601
|
-
"sex": ("Phenotype", "all", "pato"),
|
2602
|
-
"tissue": ("Tissue", "all", "uberon"),
|
2603
|
-
}
|
2604
|
-
# fmt: on
|
2605
|
-
|
2606
|
-
# Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
|
2607
|
-
entity_to_sources = {
|
2608
|
-
entity: _fetch_bionty_source(*params)
|
2609
|
-
for entity, params in entity_mapping.items()
|
2610
|
-
if entity in obs.columns
|
2611
|
-
or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
|
2612
|
-
or entity == "var_index"
|
2613
|
-
}
|
2614
|
-
|
2615
|
-
return entity_to_sources
|
2616
|
-
|
2617
|
-
def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
|
2618
|
-
"""Converts a column that stores a name into a column that stores the ontology id.
|
2619
|
-
|
2620
|
-
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2621
|
-
"""
|
2622
|
-
field_name = field.field.name
|
2623
|
-
assert field_name == "name" # noqa: S101
|
2624
|
-
cols = ["name", "ontology_id"]
|
2625
|
-
registry = field.field.model
|
2401
|
+
@classmethod
|
2402
|
+
def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
|
2403
|
+
"""Returns the CELLxGENE schema mapped fields."""
|
2404
|
+
from ._cellxgene_schemas import _get_cxg_categoricals
|
2626
2405
|
|
2627
|
-
|
2628
|
-
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2629
|
-
mapper = (
|
2630
|
-
pd.DataFrame(validated_records.values_list(*cols))
|
2631
|
-
.set_index(0)
|
2632
|
-
.to_dict()[1]
|
2633
|
-
)
|
2634
|
-
return values.map(mapper)
|
2406
|
+
return _get_cxg_categoricals()
|
2635
2407
|
|
2636
|
-
def validate(self) -> bool:
|
2408
|
+
def validate(self) -> bool:
|
2637
2409
|
"""Validates the AnnData object against most cellxgene requirements."""
|
2410
|
+
from ._cellxgene_schemas import RESERVED_NAMES
|
2411
|
+
|
2638
2412
|
# Verify that all required obs columns are present
|
2413
|
+
required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
|
2639
2414
|
missing_obs_fields = [
|
2640
2415
|
name
|
2641
|
-
for name in
|
2416
|
+
for name in required_columns
|
2642
2417
|
if name not in self._adata.obs.columns
|
2643
2418
|
and f"{name}_ontology_term_id" not in self._adata.obs.columns
|
2644
2419
|
]
|
2645
2420
|
if len(missing_obs_fields) > 0:
|
2646
|
-
|
2647
|
-
|
2648
|
-
|
2649
|
-
"consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
|
2650
|
-
"to automatically add these columns with default values."
|
2421
|
+
logger.error(
|
2422
|
+
f"missing required obs columns {_format_values(missing_obs_fields)}\n"
|
2423
|
+
" → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
|
2651
2424
|
)
|
2652
2425
|
return False
|
2653
2426
|
|
2654
2427
|
# Verify that no cellxgene reserved names are present
|
2655
|
-
reserved_names = {
|
2656
|
-
"ethnicity",
|
2657
|
-
"ethnicity_ontology_term_id",
|
2658
|
-
"X_normalization",
|
2659
|
-
"default_field",
|
2660
|
-
"layer_descriptions",
|
2661
|
-
"tags",
|
2662
|
-
"versions",
|
2663
|
-
"contributors",
|
2664
|
-
"preprint_doi",
|
2665
|
-
"project_description",
|
2666
|
-
"project_links",
|
2667
|
-
"project_name",
|
2668
|
-
"publication_doi",
|
2669
|
-
}
|
2670
2428
|
matched_columns = [
|
2671
|
-
column for column in self._adata.obs.columns if column in
|
2429
|
+
column for column in self._adata.obs.columns if column in RESERVED_NAMES
|
2672
2430
|
]
|
2673
2431
|
if len(matched_columns) > 0:
|
2674
2432
|
raise ValueError(
|
@@ -2695,6 +2453,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
|
2695
2453
|
Returns:
|
2696
2454
|
An AnnData object which adheres to the cellxgene-schema.
|
2697
2455
|
"""
|
2456
|
+
|
2457
|
+
def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
|
2458
|
+
"""Converts a column that stores a name into a column that stores the ontology id.
|
2459
|
+
|
2460
|
+
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2461
|
+
"""
|
2462
|
+
field_name = field.field.name
|
2463
|
+
assert field_name == "name" # noqa: S101
|
2464
|
+
cols = ["name", "ontology_id"]
|
2465
|
+
registry = field.field.model
|
2466
|
+
|
2467
|
+
if hasattr(registry, "ontology_id"):
|
2468
|
+
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2469
|
+
mapper = (
|
2470
|
+
pd.DataFrame(validated_records.values_list(*cols))
|
2471
|
+
.set_index(0)
|
2472
|
+
.to_dict()[1]
|
2473
|
+
)
|
2474
|
+
return values.map(mapper)
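The helper above boils down to a registry-backed pandas mapping from human-readable names to ontology ids; a toy illustration of the same idea with a hard-coded mapper (the real mapper is built from the registry query shown above):

    import pandas as pd

    names = pd.Series(["T cell", "B cell", "T cell"])
    mapper = {"T cell": "CL:0000084", "B cell": "CL:0000236"}
    ontology_ids = names.map(mapper)  # names without a mapping become NaN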
|
2475
|
+
|
2698
2476
|
# Create a copy since we modify the AnnData object extensively
|
2699
2477
|
adata_cxg = self._adata.copy()
|
2700
2478
|
|
@@ -2714,7 +2492,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
         # convert name column to ontology_term_id column
         for column in adata_cxg.obs.columns:
             if column in self.categoricals and not column.endswith("_ontology_term_id"):
-                mapped_column =
+                mapped_column = _convert_name_to_ontology_id(
                     adata_cxg.obs[column], field=self.categoricals.get(column)
                 )
                 if mapped_column is not None:
@@ -2880,7 +2658,7 @@ class TimeHandler:


 class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
-    """
+    """Categorical manager for `AnnData` to manage perturbations."""

     PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}

@@ -2891,45 +2669,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
         pert_dose: bool = True,
         pert_time: bool = True,
         *,
+        cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
         verbosity: str = "hint",
-        cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
     ):
         """Initialize the curator with configuration and validation settings."""
-        import bionty as bt
-
         self._pert_time = pert_time
         self._pert_dose = pert_dose

         self._validate_initial_data(adata)
-        self.
-
-        self._setup_sources(adata)
-        self._setup_compound_source()
+        categoricals, categoricals_defaults = self._configure_categoricals(adata)

         super().__init__(
             adata=adata,
-            categoricals=
-            defaults=
-            verbosity=verbosity,
+            categoricals=categoricals,
+            defaults=categoricals_defaults,
             organism=organism,
-            extra_sources=self.
+            extra_sources=self._configure_sources(adata),
             schema_version=cxg_schema_version,
+            verbosity=verbosity,
         )

-    def
+    def _configure_categoricals(self, adata: ad.AnnData):
         """Set up default configuration values."""
         import bionty as bt
         import wetlab as wl

-
-            CellxGeneAnnDataCatManager._get_categoricals_defaults()
-            | {
-                "cell_line": "unknown",
-                "pert_target": "unknown",
-            }
-        )
-
-        self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
+        categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
             k: v
             for k, v in {
                 "cell_line": bt.CellLine.name,
@@ -2941,22 +2706,40 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
             }.items()
             if k in adata.obs.columns
         }
-        # if "donor_id" in
-        #
+        # if "donor_id" in categoricals:
+        #     categoricals["donor_id"] = Donor.name

-
+        categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
+            "cell_line": "unknown",
+            "pert_target": "unknown",
+        }
+
+        return categoricals, categoricals_defaults
+
+    def _configure_sources(self, adata: ad.AnnData):
         """Set up data sources."""
-
-
-
-
-
+        import bionty as bt
+        import wetlab as wl
+
+        sources = {}
+        if "cell_line" in adata.obs.columns:
+            sources["cell_line"] = bt.Source.filter(
+                entity="bionty.CellLine", name="depmap"
+            ).first()
         if "pert_compound" in adata.obs.columns:
-
+            with logger.mute():
+                chebi_source = bt.Source.filter(
+                    entity="wetlab.Compound", name="chebi"
+                ).first()
+                if not chebi_source:
+                    wl.Compound.add_source(
+                        bt.Source.filter(entity="Drug", name="chebi").first()
+                    )

-
+            sources["pert_compound"] = bt.Source.filter(
                 entity="wetlab.Compound", name="chebi"
             ).first()
+        return sources

     def _validate_initial_data(self, adata: ad.AnnData):
         """Validate the initial data structure."""
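`_configure_sources` above selects public sources per obs column (DepMap for cell lines, ChEBI for compounds) and returns them as a dict. A hedged sketch of the same lookups done directly, assuming bionty and wetlab are installed and a lamindb instance is loaded; `.first()` simply returns None when a source is not registered:

    import bionty as bt

    # the same source lookups the curator performs internally
    depmap = bt.Source.filter(entity="bionty.CellLine", name="depmap").first()
    chebi = bt.Source.filter(entity="wetlab.Compound", name="chebi").first()
    sources = {"cell_line": depmap, "pert_compound": chebi}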
@@ -3004,20 +2787,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
             adata.obs[col_name].cat.remove_unused_categories()
             logger.important(f"mapped 'pert_name' to '{col_name}'")

-    def _setup_compound_source(self):
-        """Set up the compound source with muted logging."""
-        import bionty as bt
-        import wetlab as wl
-
-        with logger.mute():
-            chebi_source = bt.Source.filter(
-                entity="wetlab.Compound", name="chebi"
-            ).first()
-            if not chebi_source:
-                wl.Compound.add_source(
-                    bt.Source.filter(entity="Drug", name="chebi").first()
-                )
-
     def validate(self) -> bool:  # type: ignore
         """Validate the AnnData object."""
         validated = super().validate()
@@ -3135,8 +2904,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):

 def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
     """Make sure the source and organism are saved in the same database as the registry."""
-    from lamindb.core._settings import settings
-
     db = registry.filter().db
     source = kwargs.get("source")
     organism = kwargs.get("organism")
@@ -3161,44 +2928,15 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
     return filter_kwargs


-def
-    values: Iterable[str],
-    field: FieldAttr,
-    registry: type[Record],
-    exclude: str | list | None = None,
-    **kwargs,
-):
-    """Inspect values using a registry."""
-    # inspect exclude values in the default instance
-    values = list(values)
-    include_validated = []
-    if exclude is not None:
-        exclude = [exclude] if isinstance(exclude, str) else exclude
-        exclude = [i for i in exclude if i in values]
-        if len(exclude) > 0:
-            # exclude values are validated without source and organism
-            inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
-            # if exclude values are validated, remove them from the values
-            values = [i for i in values if i not in inspect_result_exclude.validated]
-            include_validated = inspect_result_exclude.validated
-
-    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
-    inspect_result._validated += include_validated
-    inspect_result._non_validated = [
-        i for i in inspect_result.non_validated if i not in include_validated
-    ]
-
-    return inspect_result
-
-
-def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
+def configure_organism(registry: Record, organism: str | None = None) -> dict[str, str]:
     """Check if a registry needs an organism and return the organism name."""
-
+    from ..models._from_values import _is_organism_required
+
+    if _is_organism_required(registry):
         import bionty as bt

-        if organism is None
-        return {}
-        return {"organism": organism or bt.settings.organism.name}
+        if organism is not None or bt.settings.organism is not None:
+            return {"organism": organism or bt.settings.organism.name}
     return {}


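`configure_organism` (the renamed `check_registry_organism`) now only returns organism kwargs when the registry actually requires an organism and one can be resolved. A hypothetical illustration of the intended behavior, not a tested call; the registries shown are examples of an organism-scoped and a non-scoped registry:

    import bionty as bt
    from lamindb import ULabel

    # organism-aware registry with an explicit organism -> {"organism": "human"} expected
    configure_organism(bt.CellMarker, organism="human")
    # registry without an organism relation -> {} expected
    configure_organism(ULabel)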
@@ -3208,7 +2946,6 @@ def validate_categories(
     key: str,
     organism: str | None = None,
     source: Record | None = None,
-    exclude: str | list | None = None,
     hint_print: str | None = None,
     curator: CatManager | None = None,
 ) -> tuple[bool, list[str]]:
@@ -3220,13 +2957,9 @@ def validate_categories(
         key: The key referencing the slot in the DataFrame.
         organism: The organism name.
         source: The source record.
-        exclude: Exclude specific values from validation.
         standardize: Whether to standardize the values.
         hint_print: The hint to print that suggests fixing non-validated values.
     """
-    from lamindb.core._settings import settings
-    from lamindb.models._from_values import _format_values
-
     model_field = f"{field.field.model.__name__}.{field.field.name}"

     def _log_mapping_info():
@@ -3236,36 +2969,26 @@ def validate_categories(

     registry = field.field.model

-    # {"organism": organism_name
-    kwargs =
+    # {"organism": organism_name}
+    kwargs = configure_organism(registry, organism)
     kwargs.update({"source": source} if source else {})
     kwargs_current = get_current_filter_kwargs(registry, kwargs)

     # inspect values from the default instance
-    inspect_result =
-        values=values,
-        field=field,
-        registry=registry,
-        exclude=exclude,
-        **kwargs_current,
-    )
+    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
     non_validated = inspect_result.non_validated
     syn_mapper = inspect_result.synonyms_mapper

     # inspect the non-validated values from public (bionty only)
     values_validated = []
     if hasattr(registry, "public"):
-
-
-
-
-
-
-
-            )
-            values_validated += [getattr(r, field.field.name) for r in public_records]
-        finally:
-            settings.verbosity = verbosity
+        public_records = registry.from_values(
+            non_validated,
+            field=field,
+            mute=True,
+            **kwargs_current,
+        )
+        values_validated += [getattr(r, field.field.name) for r in public_records]

     # logging messages
     non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
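The rewritten `validate_categories` uses a two-pass pattern: first `inspect` values against records already in the instance, then try to resolve the leftovers from the public ontology via `from_values`. A rough standalone sketch of that pattern under the assumption that bionty is installed and an instance is loaded; the example values are arbitrary:

    import bionty as bt

    values = ["T cell", "B cell", "not a cell type"]
    # pass 1: records already registered in the instance
    result = bt.CellType.inspect(values, field=bt.CellType.name, mute=True)
    # pass 2: try to pull the remaining values from the public ontology
    public_records = bt.CellType.from_values(
        result.non_validated, field=bt.CellType.name, mute=True
    )
    validated_from_public = [r.name for r in public_records]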
@@ -3329,7 +3052,6 @@ def validate_categories_in_df(
     df: pd.DataFrame,
     fields: dict[str, FieldAttr],
     sources: dict[str, Record] = None,
-    exclude: dict | None = None,
     curator: CatManager | None = None,
     **kwargs,
 ) -> tuple[bool, dict]:
@@ -3347,7 +3069,6 @@ def validate_categories_in_df(
             field=field,
             key=key,
             source=sources.get(key),
-            exclude=exclude.get(key) if exclude else None,
             curator=curator,
             **kwargs,
         )
@@ -3358,9 +3079,10 @@ def validate_categories_in_df(


 def save_artifact(
-    data: pd.DataFrame |
+    data: pd.DataFrame | ScverseDataStructures,
+    *,
     fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
-
+    index_field: FieldAttr | dict[str, FieldAttr] | None = None,
     description: str | None = None,
     organism: str | None = None,
     key: str | None = None,
@@ -3368,73 +3090,64 @@ def save_artifact(
     revises: Artifact | None = None,
     run: Run | None = None,
     schema: Schema | None = None,
+    **kwargs,
 ) -> Artifact:
     """Save all metadata with an Artifact.

     Args:
-        data: The
+        data: The object to save.
         fields: A dictionary mapping obs_column to registry_field.
-
+        index_field: The registry field to validate variables index against.
         description: A description of the artifact.
         organism: The organism name.
-        type: The artifact type.
         key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
         artifact: A already registered artifact. Passing this will not save a new artifact from data.
         revises: Previous version of the artifact. Triggers a revision.
         run: The run that creates the artifact.
+        schema: The Schema to associate with the Artifact.

     Returns:
         The saved Artifact.
     """
-    from ..models.artifact import add_labels
+    from ..models.artifact import add_labels

     if artifact is None:
-        if
-            artifact = Artifact.
+        if isinstance(data, pd.DataFrame):
+            artifact = Artifact.from_df(
                 data, description=description, key=key, revises=revises, run=run
             )
-        elif isinstance(data,
-            artifact = Artifact.
+        elif isinstance(data, AnnData):
+            artifact = Artifact.from_anndata(
                 data, description=description, key=key, revises=revises, run=run
             )
         elif data_is_mudata(data):
             artifact = Artifact.from_mudata(
-                data,
-
-
-
-                run=run
+                data, description=description, key=key, revises=revises, run=run
+            )
+        elif data_is_spatialdata(data):
+            artifact = Artifact.from_spatialdata(
+                data, description=description, key=key, revises=revises, run=run
+            )
+        else:
+            raise InvalidArgument(  # pragma: no cover
+                "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
             )
-    artifact.schema = schema
     artifact.save()

-    if organism is not None and
-        feature_kwargs =
+    if organism is not None and index_field is not None:
+        feature_kwargs = configure_organism(
             (
-                list(
-                if isinstance(
-                else
+                list(index_field.values())[0].field.model
+                if isinstance(index_field, dict)
+                else index_field.field.model
             ),
             organism,
         )
     else:
         feature_kwargs = {}

-    if artifact.otype == "DataFrame":
-        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)  # type: ignore
-    elif artifact.otype == "AnnData":
-        artifact.features._add_set_from_anndata(  # type: ignore
-            var_field=columns_field, **feature_kwargs
-        )
-    elif artifact.otype == "MuData":
-        artifact.features._add_set_from_mudata(  # type: ignore
-            var_fields=columns_field, **feature_kwargs
-        )
-    else:
-        raise NotImplementedError
-
     def _add_labels(
-        data,
+        data: pd.DataFrame | ScverseDataStructures,
         artifact: Artifact,
         fields: dict[str, FieldAttr],
         feature_ref_is_name: bool | None = None,
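`save_artifact` now dispatches on the type of `data` to the matching `Artifact` constructor (`from_df`, `from_anndata`, `from_mudata`, `from_spatialdata`) and raises otherwise. A hedged usage sketch of the DataFrame branch only; it requires a loaded lamindb instance, and the key and description are placeholders:

    import pandas as pd
    import lamindb as ln

    df = pd.DataFrame({"cell_type": ["T cell", "B cell"]})
    # equivalent to what the DataFrame branch above does before feature registration
    artifact = ln.Artifact.from_df(
        df, description="toy example", key="examples/toy.parquet"
    ).save()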
@@ -3443,7 +3156,7 @@ def save_artifact(
         for key, field in fields.items():
             feature = features.get(key)
             registry = field.field.model
-            filter_kwargs =
+            filter_kwargs = configure_organism(registry, organism)
             filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
             df = data if isinstance(data, pd.DataFrame) else data.obs
             # multi-value columns are separated by "|"
@@ -3470,35 +3183,81 @@ def save_artifact(
                 from_curator=True,
             )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    match artifact.otype:
+        case "DataFrame":
+            artifact.features._add_set_from_df(field=index_field, **feature_kwargs)  # type: ignore
+            _add_labels(
+                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
+            )
+        case "AnnData":
+            artifact.features._add_set_from_anndata(  # type: ignore
+                var_field=index_field, **feature_kwargs
+            )
+            _add_labels(
+                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
+            )
+        case "MuData":
+            artifact.features._add_set_from_mudata(  # type: ignore
+                var_fields=index_field, **feature_kwargs
+            )
+            for modality, modality_fields in fields.items():
+                column_field_modality = index_field.get(modality)
+                if modality == "obs":
+                    _add_labels(
+                        data,
+                        artifact,
+                        modality_fields,
+                        feature_ref_is_name=(
+                            None
+                            if column_field_modality is None
+                            else _ref_is_name(column_field_modality)
+                        ),
+                    )
+                else:
+                    _add_labels(
+                        data[modality],
+                        artifact,
+                        modality_fields,
+                        feature_ref_is_name=(
+                            None
+                            if column_field_modality is None
+                            else _ref_is_name(column_field_modality)
+                        ),
+                    )
+        case "SpatialData":
+            artifact.features._add_set_from_spatialdata(  # type: ignore
+                sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
+                var_fields=index_field,
+                **feature_kwargs,
+            )
+            sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
+            for accessor, accessor_fields in fields.items():
+                column_field = index_field.get(accessor)
+                if accessor == sample_metadata_key:
+                    _add_labels(
+                        data.get_attrs(
+                            key=sample_metadata_key, return_as="df", flatten=True
+                        ),
+                        artifact,
+                        accessor_fields,
+                        feature_ref_is_name=(
+                            None if column_field is None else _ref_is_name(column_field)
+                        ),
+                    )
+                else:
+                    _add_labels(
+                        data.tables[accessor],
+                        artifact,
+                        accessor_fields,
+                        feature_ref_is_name=(
+                            None if column_field is None else _ref_is_name(column_field)
+                        ),
+                    )
+        case _:
+            raise NotImplementedError  # pragma: no cover
+
+    artifact.schema = schema
+    artifact.save()

     slug = ln_setup.settings.instance.slug
     if ln_setup.settings.instance.is_remote:  # pdagma: no cover
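For the `MuData` and `SpatialData` branches above, both `fields` and `index_field` become nested mappings keyed by modality or by table/sample accessor, which is what the per-accessor loops iterate over. A hypothetical sketch of those argument shapes; the keys, registries, and column names are examples only, not taken from the package:

    import bionty as bt
    from lamindb.models import Feature

    # per-modality categorical fields, plus a shared sample-level slot
    fields = {
        "obs": {"donor": Feature.name},
        "rna": {"cell_type": bt.CellType.name},
    }
    # var-index registry field, one entry per modality
    index_field = {"rna": bt.Gene.symbol}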
@@ -3528,8 +3287,7 @@ def update_registry(
     organism: str | None = None,
     dtype: str | None = None,
     source: Record | None = None,
-
-    **kwargs,
+    **create_kwargs,
 ) -> None:
     """Save features or labels records in the default instance..

@@ -3542,14 +3300,12 @@ def update_registry(
         organism: The organism name.
         dtype: The type of the feature.
         source: The source record.
-
-        kwargs: Additional keyword arguments to pass to the registry model to create new records.
+        **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
     """
-    from lamindb.core._settings import settings
     from lamindb.models.save import save as ln_save

     registry = field.field.model
-    filter_kwargs =
+    filter_kwargs = configure_organism(registry, organism)
     filter_kwargs.update({"source": source} if source else {})
     values = [i for i in values if isinstance(i, str) and i]
     if not values:
@@ -3607,14 +3363,16 @@ def update_registry(
                 registry(
                     **init_kwargs,
                     **{k: v for k, v in filter_kwargs.items() if k != "source"},
-                    **{
+                    **{
+                        k: v for k, v in create_kwargs.items() if k != "sources"
+                    },
                 )
             )
         ln_save(non_validated_records)

         # save parent labels for ulabels, for example a parent label "project" for label "project001"
         if registry == ULabel and field.field.name == "name":
-
+            save_ulabels_type(values, field=field, key=key)

     finally:
         settings.verbosity = verbosity
@@ -3652,16 +3410,18 @@ def log_saved_labels(
     )


-def
-    """Save
+def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
+    """Save the ULabel type of the given labels."""
     registry = field.field.model
     assert registry == ULabel  # noqa: S101
-    all_records = registry.
-
-
-
-
-
+    all_records = registry.filter(**{field.field.name: list(values)}).all()
+    # so `tissue_type` becomes `TissueType`
+    type_name = "".join([i.capitalize() for i in key.lower().split("_")])
+    ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
+    if ulabel_type is None:
+        ulabel_type = registry(name=type_name, is_type=True).save()
+        logger.important(f"Created a ULabel type: {ulabel_type}")
+    all_records.update(type=ulabel_type)


 def _save_organism(name: str):
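The comment inside `save_ulabels_type` notes that a key like `tissue_type` becomes the ULabel type name `TissueType`. The conversion is plain string manipulation and can be checked in isolation:

    # worked example of the key -> type-name conversion used above
    key = "tissue_type"
    type_name = "".join(part.capitalize() for part in key.lower().split("_"))
    assert type_name == "TissueType"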
@@ -3760,7 +3520,6 @@ def from_tiledbsoma(
     obs_columns: FieldAttr = Feature.name,
     organism: str | None = None,
     sources: dict[str, Record] | None = None,
-    exclude: dict[str, str | list[str]] | None = None,
 ) -> TiledbsomaCatManager:
     return TiledbsomaCatManager(
         experiment_uri=experiment_uri,
@@ -3769,7 +3528,6 @@ def from_tiledbsoma(
         obs_columns=obs_columns,
         organism=organism,
         sources=sources,
-        exclude=exclude,
     )


@@ -3781,7 +3539,6 @@ def from_spatialdata(
     categoricals: dict[str, dict[str, FieldAttr]] | None = None,
     organism: str | None = None,
     sources: dict[str, dict[str, Record]] | None = None,
-    exclude: dict[str, dict] | None = None,
     verbosity: str = "hint",
     *,
     sample_metadata_key: str = "sample",
@@ -3798,7 +3555,6 @@ def from_spatialdata(
         verbosity=verbosity,
         organism=organism,
         sources=sources,
-        exclude=exclude,
         sample_metadata_key=sample_metadata_key,
     )
