lamindb-1.2.0-py3-none-any.whl → lamindb-1.3.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,25 +1,27 @@
1
1
  """Curators.
2
2
 
3
- .. versionadded:: 1.1.0
4
-
5
3
  .. autosummary::
6
4
  :toctree: .
7
5
 
8
- Curator
9
6
  DataFrameCurator
10
7
  AnnDataCurator
8
+ MuDataCurator
9
+ SpatialDataCurator
11
10
 
12
- CatManager:
11
+ Helper classes.
13
12
 
14
13
  .. autosummary::
15
14
  :toctree: .
16
15
 
16
+ Curator
17
+ SlotsCurator
17
18
  CatManager
19
+ CatLookup
18
20
  DataFrameCatManager
19
21
  AnnDataCatManager
20
22
  MuDataCatManager
23
+ SpatialDataCatManager
21
24
  TiledbsomaCatManager
22
- CurateLookup
23
25
 
24
26
  """
25
27
 
@@ -27,7 +29,6 @@ from __future__ import annotations
27
29
 
28
30
  import copy
29
31
  import re
30
- from importlib import resources
31
32
  from itertools import chain
32
33
  from typing import TYPE_CHECKING, Any, Literal
33
34
 
@@ -37,45 +38,44 @@ import pandas as pd
37
38
  import pandera
38
39
  import pyarrow as pa
39
40
  from lamin_utils import colors, logger
40
- from lamindb_setup.core import deprecated, upath
41
+ from lamindb_setup.core import deprecated
41
42
  from lamindb_setup.core._docs import doc_args
42
43
  from lamindb_setup.core.upath import UPath
43
44
 
44
- from lamindb.core.storage._backed_access import backed_access
45
-
46
- from ._cellxgene_schemas import _read_schema_versions
47
-
48
45
  if TYPE_CHECKING:
49
- from anndata import AnnData
50
46
  from lamindb_setup.core.types import UPathStr
47
+ from mudata import MuData
48
+ from spatialdata import SpatialData
51
49
 
52
- from lamindb.base.types import FieldAttr
50
+ from lamindb.core.types import ScverseDataStructures
53
51
  from lamindb.models import Record
54
52
  from lamindb.base.types import FieldAttr # noqa
55
53
  from lamindb.core._settings import settings
56
54
  from lamindb.models import (
57
55
  Artifact,
58
- Collection,
59
56
  Feature,
60
57
  Record,
61
58
  Run,
62
59
  Schema,
63
60
  ULabel,
64
61
  )
65
- from lamindb.models._feature_manager import parse_staged_feature_sets_from_anndata
66
- from lamindb.models.artifact import add_labels, data_is_anndata
62
+ from lamindb.models.artifact import (
63
+ add_labels,
64
+ data_is_anndata,
65
+ data_is_mudata,
66
+ data_is_spatialdata,
67
+ )
67
68
  from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
68
69
  from lamindb.models._from_values import _format_values
69
70
 
70
71
  from ..errors import InvalidArgument, ValidationError
72
+ from anndata import AnnData
71
73
 
72
74
  if TYPE_CHECKING:
73
75
  from collections.abc import Iterable, MutableMapping
74
76
  from typing import Any
75
77
 
76
78
  from lamindb_setup.core.types import UPathStr
77
- from mudata import MuData
78
- from spatialdata import SpatialData
79
79
 
80
80
  from lamindb.models.query_set import RecordList
81
81
 
@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
86
86
  return ansi_pattern.sub("", text)
87
87
 
88
88
 
89
- class CurateLookup:
89
+ class CatLookup:
90
90
  """Lookup categories from the reference instance.
91
91
 
92
92
  Args:
@@ -94,10 +94,10 @@ class CurateLookup:
94
94
  slots: A dictionary of slot fields to lookup.
95
95
  public: Whether to lookup from the public instance. Defaults to False.
96
96
 
97
- Example:
98
- >>> curator = ln.Curator.from_df(...)
99
- >>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
100
- <Category: alveolar_type_1_fibroblast_cell>
97
+ Example::
98
+
99
+ curator = ln.curators.DataFrameCurator(...)
100
+ curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
101
101
 
102
102
  """
103
103
 
@@ -163,7 +163,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
163
163
  """
164
164
 
165
165
 
166
- VALIDATE_DOCSTRING = """Validate dataset.
166
+ VALIDATE_DOCSTRING = """Validate dataset against Schema.
167
167
 
168
168
  Raises:
169
169
  lamindb.errors.ValidationError: If validation fails.
@@ -183,15 +183,17 @@ Returns:
183
183
 
184
184
 
185
185
  class Curator:
186
- """Dataset curator.
186
+ """Curator base class.
187
187
 
188
188
  A `Curator` object makes it easy to validate, standardize & annotate datasets.
189
189
 
190
- .. versionadded:: 1.1.0
191
-
192
190
  See:
193
191
  - :class:`~lamindb.curators.DataFrameCurator`
194
192
  - :class:`~lamindb.curators.AnnDataCurator`
193
+ - :class:`~lamindb.curators.MuDataCurator`
194
+ - :class:`~lamindb.curators.SpatialDataCurator`
195
+
196
+ .. versionadded:: 1.1.0
195
197
  """
196
198
 
197
199
  def __init__(self, dataset: Any, schema: Schema | None = None):
@@ -199,7 +201,12 @@ class Curator:
199
201
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
200
202
  if isinstance(self._dataset, Artifact):
201
203
  self._artifact = self._dataset
202
- if self._artifact.otype in {"DataFrame", "AnnData"}:
204
+ if self._artifact.otype in {
205
+ "DataFrame",
206
+ "AnnData",
207
+ "MuData",
208
+ "SpatialData",
209
+ }:
203
210
  self._dataset = self._dataset.load()
204
211
  self._schema: Schema | None = schema
205
212
  self._is_validated: bool = False
@@ -225,9 +232,72 @@ class Curator:
225
232
  pass
226
233
 
227
234
 
235
+ class SlotsCurator(Curator):
236
+ """Curator for a dataset with slots.
237
+
238
+ Args:
239
+ dataset: The dataset to validate & annotate.
240
+ schema: A `Schema` object that defines the validation constraints.
241
+
242
+ .. versionadded:: 1.3.0
243
+ """
244
+
245
+ def __init__(
246
+ self,
247
+ dataset: Any,
248
+ schema: Schema,
249
+ ) -> None:
250
+ super().__init__(dataset=dataset, schema=schema)
251
+ self._slots: dict[str, DataFrameCurator] = {}
252
+
253
+ # used in MuDataCurator and SpatialDataCurator
254
+ # in form of {table/modality_key: var_field}
255
+ self._var_fields: dict[str, FieldAttr] = {}
256
+ # in form of {table/modality_key: categoricals}
257
+ self._categoricals: dict[str, dict[str, FieldAttr]] = {}
258
+
259
+ @property
260
+ @doc_args(SLOTS_DOCSTRING)
261
+ def slots(self) -> dict[str, DataFrameCurator]:
262
+ """{}""" # noqa: D415
263
+ return self._slots
264
+
265
+ @doc_args(VALIDATE_DOCSTRING)
266
+ def validate(self) -> None:
267
+ """{}""" # noqa: D415
268
+ for _, curator in self._slots.items():
269
+ curator.validate()
270
+
271
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
272
+ def save_artifact(
273
+ self,
274
+ *,
275
+ key: str | None = None,
276
+ description: str | None = None,
277
+ revises: Artifact | None = None,
278
+ run: Run | None = None,
279
+ ) -> Artifact:
280
+ """{}""" # noqa: D415
281
+ if not self._is_validated:
282
+ self.validate()
283
+
284
+ # default implementation for MuDataCurator and SpatialDataCurator
285
+ return save_artifact( # type: ignore
286
+ self._dataset,
287
+ key=key,
288
+ description=description,
289
+ fields=self._categoricals,
290
+ index_field=self._var_fields,
291
+ artifact=self._artifact,
292
+ revises=revises,
293
+ run=run,
294
+ schema=self._schema,
295
+ )
296
+
297
+
228
298
  class DataFrameCurator(Curator):
229
299
  # the example in the docstring is tested in test_curators_quickstart_example
230
- """Curator for a DataFrame object.
300
+ """Curator for `DataFrame`.
231
301
 
232
302
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
233
303
 
@@ -282,7 +352,9 @@ class DataFrameCurator(Curator):
282
352
  feature.dtype if not feature.dtype.startswith("cat") else "category"
283
353
  )
284
354
  pandera_columns[feature.name] = pandera.Column(
285
- pandera_dtype, nullable=feature.nullable
355
+ pandera_dtype,
356
+ nullable=feature.nullable,
357
+ coerce=feature.coerce_dtype,
286
358
  )
287
359
  if feature.dtype.startswith("cat"):
288
360
  categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
@@ -378,7 +450,7 @@ class DataFrameCurator(Curator):
378
450
  description: str | None = None,
379
451
  revises: Artifact | None = None,
380
452
  run: Run | None = None,
381
- ):
453
+ ) -> Artifact:
382
454
  """{}""" # noqa: D415
383
455
  if not self._is_validated:
384
456
  self.validate() # raises ValidationError if doesn't validate
@@ -387,7 +459,7 @@ class DataFrameCurator(Curator):
387
459
  self._dataset,
388
460
  description=description,
389
461
  fields=self._cat_manager.categoricals,
390
- columns_field=result["field"],
462
+ index_field=result["field"],
391
463
  key=key,
392
464
  artifact=self._artifact,
393
465
  revises=revises,
@@ -396,9 +468,9 @@ class DataFrameCurator(Curator):
396
468
  )
397
469
 
398
470
 
399
- class AnnDataCurator(Curator):
471
+ class AnnDataCurator(SlotsCurator):
400
472
  # the example in the docstring is tested in test_curators_quickstart_example
401
- """Curator for a DataFrame object.
473
+ """Curator for `AnnData`.
402
474
 
403
475
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
404
476
 
@@ -446,7 +518,7 @@ class AnnDataCurator(Curator):
446
518
  ).save()
447
519
 
448
520
  # curate an AnnData
449
- adata = datasets.small_dataset1(otype="AnnData")
521
+ adata = ln.core.datasets.small_dataset1(otype="AnnData")
450
522
  curator = ln.curators.AnnDataCurator(adata, anndata_schema)
451
523
  artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
452
524
  assert artifact.schema == anndata_schema
@@ -466,9 +538,9 @@ class AnnDataCurator(Curator):
466
538
  self._slots = {
467
539
  slot: DataFrameCurator(
468
540
  (
469
- self._dataset.__getattribute__(slot).T
541
+ getattr(self._dataset, slot).T
470
542
  if slot == "var"
471
- else self._dataset.__getattribute__(slot)
543
+ else getattr(self._dataset, slot)
472
544
  ),
473
545
  slot_schema,
474
546
  )
@@ -476,18 +548,6 @@ class AnnDataCurator(Curator):
476
548
  if slot in {"obs", "var"}
477
549
  }
478
550
 
479
- @property
480
- @doc_args(SLOTS_DOCSTRING)
481
- def slots(self) -> dict[str, DataFrameCurator]:
482
- """{}""" # noqa: D415
483
- return self._slots
484
-
485
- @doc_args(VALIDATE_DOCSTRING)
486
- def validate(self) -> None:
487
- """{}""" # noqa: D415
488
- for _, curator in self._slots.items():
489
- curator.validate()
490
-
491
551
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
492
552
  def save_artifact(
493
553
  self,
@@ -496,7 +556,7 @@ class AnnDataCurator(Curator):
496
556
  description: str | None = None,
497
557
  revises: Artifact | None = None,
498
558
  run: Run | None = None,
499
- ):
559
+ ) -> Artifact:
500
560
  """{}""" # noqa: D415
501
561
  if not self._is_validated:
502
562
  self.validate()
@@ -504,7 +564,7 @@ class AnnDataCurator(Curator):
504
564
  self._dataset,
505
565
  description=description,
506
566
  fields=self.slots["obs"]._cat_manager.categoricals,
507
- columns_field=(
567
+ index_field=(
508
568
  parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
509
569
  "field"
510
570
  ]
@@ -519,34 +579,286 @@ class AnnDataCurator(Curator):
519
579
  )
520
580
 
521
581
 
522
- class CatManager:
523
- """Manage valid categoricals by updating registries.
582
+ def _assign_var_fields_categoricals_multimodal(
583
+ modality: str | None,
584
+ slot_type: str,
585
+ slot: str,
586
+ slot_schema: Schema,
587
+ var_fields: dict[str, FieldAttr],
588
+ categoricals: dict[str, dict[str, FieldAttr]],
589
+ slots: dict[str, DataFrameCurator],
590
+ ) -> None:
591
+ """Assigns var_fields and categoricals for multimodal data curators."""
592
+ if modality is not None:
593
+ # Makes sure that all tables are present
594
+ var_fields[modality] = None
595
+ categoricals[modality] = {}
596
+
597
+ if slot_type == "var":
598
+ var_field = parse_dtype_single_cat(slot_schema.itype, is_itype=True)["field"]
599
+ if modality is None:
600
+ # This should rarely/never be used since tables should have different var fields
601
+ var_fields[slot] = var_field # pragma: no cover
602
+ else:
603
+ # Note that this is NOT nested since the nested key is always "var"
604
+ var_fields[modality] = var_field
605
+ else:
606
+ obs_fields = slots[slot]._cat_manager.categoricals
607
+ if modality is None:
608
+ categoricals[slot] = obs_fields
609
+ else:
610
+ # Note that this is NOT nested since the nested key is always "obs"
611
+ categoricals[modality] = obs_fields
612
+
524
613
 
525
- A `CatManager` object makes it easy to validate, standardize & annotate datasets.
614
+ class MuDataCurator(SlotsCurator):
615
+ # the example in the docstring is tested in test_curators_quickstart_example
616
+ """Curator for `MuData`.
617
+
618
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
526
619
 
527
- Example:
620
+ .. versionadded:: 1.3.0
528
621
 
529
- >>> cat_manager = ln.CatManager(
530
- >>> dataset,
531
- >>> # define validation criteria as mappings
532
- >>> columns=Feature.name, # map column names
533
- >>> categoricals={"perturbation": ULabel.name}, # map categories
534
- >>> )
535
- >>> cat_manager.validate() # validate the dataframe
536
- >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
537
- >>> artifact.describe() # see annotations
622
+ Args:
623
+ dataset: The MuData-like object to validate & annotate.
624
+ schema: A `Schema` object that defines the validation constraints.
538
625
 
539
- `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
626
+ Example::
540
627
 
541
- If you find non-validated values, you have several options:
628
+ import lamindb as ln
629
+ import bionty as bt
630
+
631
+ # define the global obs schema
632
+ obs_schema = ln.Schema(
633
+ name="mudata_papalexi21_subset_obs_schema",
634
+ features=[
635
+ ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
636
+ ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
637
+ ],
638
+ ).save()
639
+
640
+ # define the ['rna'].obs schema
641
+ obs_schema_rna = ln.Schema(
642
+ name="mudata_papalexi21_subset_rna_obs_schema",
643
+ features=[
644
+ ln.Feature(name="nCount_RNA", dtype=int).save(),
645
+ ln.Feature(name="nFeature_RNA", dtype=int).save(),
646
+ ln.Feature(name="percent.mito", dtype=float).save(),
647
+ ],
648
+ coerce_dtype=True,
649
+ ).save()
650
+
651
+ # define the ['hto'].obs schema
652
+ obs_schema_hto = ln.Schema(
653
+ name="mudata_papalexi21_subset_hto_obs_schema",
654
+ features=[
655
+ ln.Feature(name="nCount_HTO", dtype=int).save(),
656
+ ln.Feature(name="nFeature_HTO", dtype=int).save(),
657
+ ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
658
+ ],
659
+ coerce_dtype=True,
660
+ ).save()
661
+
662
+ # define ['rna'].var schema
663
+ var_schema_rna = ln.Schema(
664
+ name="mudata_papalexi21_subset_rna_var_schema",
665
+ itype=bt.Gene.symbol,
666
+ dtype=float,
667
+ ).save()
668
+
669
+ # define composite schema
670
+ mudata_schema = ln.Schema(
671
+ name="mudata_papalexi21_subset_mudata_schema",
672
+ otype="MuData",
673
+ components={
674
+ "obs": obs_schema,
675
+ "rna:obs": obs_schema_rna,
676
+ "hto:obs": obs_schema_hto,
677
+ "rna:var": var_schema_rna,
678
+ },
679
+ ).save()
542
680
 
543
- - new values found in the data can be registered using :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
544
- - non-validated values can be accessed using :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
681
+ # curate a MuData
682
+ mdata = ln.core.datasets.mudata_papalexi21_subset()
683
+ bt.settings.organism = "human" # set the organism
684
+ curator = ln.curators.MuDataCurator(mdata, mudata_schema)
685
+ artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
686
+ assert artifact.schema == mudata_schema
545
687
  """
546
688
 
547
689
  def __init__(
548
- self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
549
- ):
690
+ self,
691
+ dataset: MuData | Artifact,
692
+ schema: Schema,
693
+ ) -> None:
694
+ super().__init__(dataset=dataset, schema=schema)
695
+ if not data_is_mudata(self._dataset):
696
+ raise InvalidArgument("dataset must be MuData-like.")
697
+ if schema.otype != "MuData":
698
+ raise InvalidArgument("Schema otype must be 'MuData'.")
699
+
700
+ for slot, slot_schema in schema.slots.items():
701
+ # Assign to _slots
702
+ if ":" in slot:
703
+ modality, modality_slot = slot.split(":")
704
+ schema_dataset = self._dataset.__getitem__(modality)
705
+ else:
706
+ modality, modality_slot = None, slot
707
+ schema_dataset = self._dataset
708
+ self._slots[slot] = DataFrameCurator(
709
+ (
710
+ getattr(schema_dataset, modality_slot).T
711
+ if modality_slot == "var"
712
+ else getattr(schema_dataset, modality_slot)
713
+ ),
714
+ slot_schema,
715
+ )
716
+ _assign_var_fields_categoricals_multimodal(
717
+ modality=modality,
718
+ slot_type=modality_slot,
719
+ slot=slot,
720
+ slot_schema=slot_schema,
721
+ var_fields=self._var_fields,
722
+ categoricals=self._categoricals,
723
+ slots=self._slots,
724
+ )
725
+
726
+ # for consistency with BaseCatManager
727
+ self._columns_field = self._var_fields
728
+
729
+
730
+ class SpatialDataCurator(SlotsCurator):
731
+ # the example in the docstring is tested in test_curators_quickstart_example
732
+ """Curator for `SpatialData`.
733
+
734
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
735
+
736
+ .. versionadded:: 1.3.0
737
+
738
+ Args:
739
+ dataset: The SpatialData-like object to validate & annotate.
740
+ schema: A `Schema` object that defines the validation constraints.
741
+
742
+ Example::
743
+
744
+ import lamindb as ln
745
+ import bionty as bt
746
+
747
+ # define sample schema
748
+ sample_schema = ln.Schema(
749
+ name="blobs_sample_level_metadata",
750
+ features=[
751
+ ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
752
+ ln.Feature(name="disease", dtype=bt.Disease).save(),
753
+ ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
754
+ ],
755
+ coerce_dtype=True
756
+ ).save()
757
+
758
+ # define table obs schema
759
+ blobs_obs_schema = ln.Schema(
760
+ name="blobs_obs_level_metadata",
761
+ features=[
762
+ ln.Feature(name="sample_region", dtype="str").save(),
763
+ ],
764
+ coerce_dtype=True
765
+ ).save()
766
+
767
+ # define table var schema
768
+ blobs_var_schema = ln.Schema(
769
+ name="blobs_var_schema",
770
+ itype=bt.Gene.ensembl_gene_id,
771
+ dtype=int
772
+ ).save()
773
+
774
+ # define composite schema
775
+ spatialdata_schema = ln.Schema(
776
+ name="blobs_spatialdata_schema",
777
+ otype="SpatialData",
778
+ components={
779
+ "sample": sample_schema,
780
+ "table:obs": blobs_obs_schema,
781
+ "table:var": blobs_var_schema,
782
+ }).save()
783
+
784
+ # curate a SpatialData
785
+ spatialdata = ln.core.datasets.spatialdata_blobs()
786
+ curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
787
+ try:
788
+ curator.validate()
789
+ except ln.errors.ValidationError as error:
790
+ print(error)
791
+
792
+ # validate again (must pass now) and save artifact
793
+ artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
794
+ assert artifact.schema == spatialdata_schema
795
+ """
796
+
797
+ def __init__(
798
+ self,
799
+ dataset: SpatialData | Artifact,
800
+ schema: Schema,
801
+ *,
802
+ sample_metadata_key: str | None = "sample",
803
+ ) -> None:
804
+ super().__init__(dataset=dataset, schema=schema)
805
+ if not data_is_spatialdata(self._dataset):
806
+ raise InvalidArgument("dataset must be SpatialData-like.")
807
+ if schema.otype != "SpatialData":
808
+ raise InvalidArgument("Schema otype must be 'SpatialData'.")
809
+
810
+ for slot, slot_schema in schema.slots.items():
811
+ # Assign to _slots
812
+ if ":" in slot:
813
+ table_key, table_slot = slot.split(":")
814
+ schema_dataset = self._dataset.tables.__getitem__(table_key)
815
+ # sample metadata (does not have a `:` separator)
816
+ else:
817
+ table_key = None
818
+ table_slot = slot
819
+ schema_dataset = self._dataset.get_attrs(
820
+ key=sample_metadata_key, return_as="df", flatten=True
821
+ )
822
+
823
+ self._slots[slot] = DataFrameCurator(
824
+ (
825
+ getattr(schema_dataset, table_slot).T
826
+ if table_slot == "var"
827
+ else (
828
+ getattr(schema_dataset, table_slot)
829
+ if table_slot != sample_metadata_key
830
+ else schema_dataset
831
+ ) # just take the schema_dataset if it's the sample metadata key
832
+ ),
833
+ slot_schema,
834
+ )
835
+
836
+ _assign_var_fields_categoricals_multimodal(
837
+ modality=table_key,
838
+ slot_type=table_slot,
839
+ slot=slot,
840
+ slot_schema=slot_schema,
841
+ var_fields=self._var_fields,
842
+ categoricals=self._categoricals,
843
+ slots=self._slots,
844
+ )
845
+
846
+ # for consistency with BaseCatManager
847
+ self._columns_field = self._var_fields
848
+
849
+
850
+ class CatManager:
851
+ """Manage categoricals by updating registries.
852
+
853
+ This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
854
+
855
+ If you find non-validated values, you have several options:
856
+
857
+ - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
858
+ - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
859
+ """
860
+
861
+ def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
550
862
  # the below is shared with Curator
551
863
  self._artifact: Artifact = None # pass the dataset as an artifact
552
864
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
@@ -560,7 +872,6 @@ class CatManager:
560
872
  self._non_validated = None
561
873
  self._organism = organism
562
874
  self._sources = sources or {}
563
- self._exclude = exclude or {}
564
875
  self._columns_field = columns_field
565
876
  self._validate_category_error_messages: str = ""
566
877
 
@@ -645,10 +956,10 @@ class CatManager:
645
956
  settings.verbosity = "warning"
646
957
  self._artifact = save_artifact( # type: ignore
647
958
  self._dataset,
959
+ key=key,
648
960
  description=description,
649
961
  fields=self.categoricals,
650
- columns_field=self._columns_field,
651
- key=key,
962
+ index_field=self._columns_field,
652
963
  artifact=self._artifact,
653
964
  revises=revises,
654
965
  run=run,
@@ -662,34 +973,7 @@ class CatManager:
662
973
 
663
974
 
664
975
  class DataFrameCatManager(CatManager):
665
- """Curation flow for a DataFrame object.
666
-
667
- See also :class:`~lamindb.Curator`.
668
-
669
- Args:
670
- df: The DataFrame object to curate.
671
- columns: The field attribute for the feature column.
672
- categoricals: A dictionary mapping column names to registry_field.
673
- verbosity: The verbosity level.
674
- organism: The organism name.
675
- sources: A dictionary mapping column names to Source records.
676
- exclude: A dictionary mapping column names to values to exclude from validation.
677
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
678
- using the exclude parameter ensures they are not validated.
679
-
680
- Returns:
681
- A curator object.
682
-
683
- Examples:
684
- >>> import bionty as bt
685
- >>> curator = ln.Curator.from_df(
686
- ... df,
687
- ... categoricals={
688
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
689
- ... "donor_id": ULabel.name
690
- ... }
691
- ... )
692
- """
976
+ """Categorical manager for `DataFrame`."""
693
977
 
694
978
  def __init__(
695
979
  self,
@@ -699,7 +983,6 @@ class DataFrameCatManager(CatManager):
699
983
  verbosity: str = "hint",
700
984
  organism: str | None = None,
701
985
  sources: dict[str, Record] | None = None,
702
- exclude: dict | None = None,
703
986
  ) -> None:
704
987
  from lamindb.core._settings import settings
705
988
 
@@ -714,17 +997,16 @@ class DataFrameCatManager(CatManager):
714
997
  organism=organism,
715
998
  categoricals=categoricals,
716
999
  sources=sources,
717
- exclude=exclude,
718
1000
  )
719
1001
  self._save_columns()
720
1002
 
721
- def lookup(self, public: bool = False) -> CurateLookup:
1003
+ def lookup(self, public: bool = False) -> CatLookup:
722
1004
  """Lookup categories.
723
1005
 
724
1006
  Args:
725
1007
  public: If "public", the lookup is performed on the public reference.
726
1008
  """
727
- return CurateLookup(
1009
+ return CatLookup(
728
1010
  categoricals=self._categoricals,
729
1011
  slots={"columns": self._columns_field},
730
1012
  public=public,
@@ -739,7 +1021,6 @@ class DataFrameCatManager(CatManager):
739
1021
  key="columns",
740
1022
  validated_only=False,
741
1023
  source=self._sources.get("columns"),
742
- exclude=self._exclude.get("columns"),
743
1024
  )
744
1025
 
745
1026
  # Save the rest of the columns based on validated_only
@@ -752,7 +1033,6 @@ class DataFrameCatManager(CatManager):
752
1033
  validated_only=validated_only,
753
1034
  df=self._dataset, # Get the Feature type from df
754
1035
  source=self._sources.get("columns"),
755
- exclude=self._exclude.get("columns"),
756
1036
  )
757
1037
 
758
1038
  @deprecated(new_name="is run by default")
@@ -778,7 +1058,6 @@ class DataFrameCatManager(CatManager):
778
1058
  self._dataset,
779
1059
  fields=self.categoricals,
780
1060
  sources=self._sources,
781
- exclude=self._exclude,
782
1061
  curator=self,
783
1062
  organism=self._organism,
784
1063
  )
@@ -852,7 +1131,6 @@ class DataFrameCatManager(CatManager):
852
1131
  key=categorical,
853
1132
  validated_only=validated_only,
854
1133
  source=self._sources.get(categorical),
855
- exclude=self._exclude.get(categorical),
856
1134
  organism=self._organism,
857
1135
  )
858
1136
  # adding new records removes them from non_validated
@@ -882,32 +1160,7 @@ class DataFrameCatManager(CatManager):
882
1160
 
883
1161
 
884
1162
  class AnnDataCatManager(CatManager):
885
- """Manage categorical curation.
886
-
887
- Args:
888
- data: The AnnData object or an AnnData-like path.
889
- var_index: The registry field for mapping the ``.var`` index.
890
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
891
- obs_columns: The registry field for mapping the ``.obs.columns``.
892
- verbosity: The verbosity level.
893
- organism: The organism name.
894
- sources: A dictionary mapping ``.obs.columns`` to Source records.
895
- exclude: A dictionary mapping column names to values to exclude from validation.
896
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
897
- using the exclude parameter ensures they are not validated.
898
-
899
- Examples:
900
- >>> import bionty as bt
901
- >>> curator = ln.Curator.from_anndata(
902
- ... adata,
903
- ... var_index=bt.Gene.ensembl_gene_id,
904
- ... categoricals={
905
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
906
- ... "donor_id": ULabel.name
907
- ... },
908
- ... organism="human",
909
- ... )
910
- """
1163
+ """Categorical manager for `AnnData`."""
911
1164
 
912
1165
  def __init__(
913
1166
  self,
@@ -918,13 +1171,10 @@ class AnnDataCatManager(CatManager):
918
1171
  verbosity: str = "hint",
919
1172
  organism: str | None = None,
920
1173
  sources: dict[str, Record] | None = None,
921
- exclude: dict | None = None,
922
1174
  ) -> None:
923
1175
  if isinstance(var_index, str):
924
1176
  raise TypeError("var_index parameter has to be a bionty field")
925
1177
 
926
- if sources is None:
927
- sources = {}
928
1178
  if not data_is_anndata(data):
929
1179
  raise TypeError("data has to be an AnnData object")
930
1180
 
@@ -935,12 +1185,12 @@ class AnnDataCatManager(CatManager):
935
1185
 
936
1186
  self._obs_fields = categoricals or {}
937
1187
  self._var_field = var_index
1188
+ self._sources = sources or {}
938
1189
  super().__init__(
939
1190
  dataset=data,
940
1191
  categoricals=categoricals,
941
- sources=sources,
1192
+ sources=self._sources,
942
1193
  organism=organism,
943
- exclude=exclude,
944
1194
  columns_field=var_index,
945
1195
  )
946
1196
  self._adata = self._dataset
@@ -950,8 +1200,7 @@ class AnnDataCatManager(CatManager):
950
1200
  columns=obs_columns,
951
1201
  verbosity=verbosity,
952
1202
  organism=None,
953
- sources=sources,
954
- exclude=exclude,
1203
+ sources=self._sources,
955
1204
  )
956
1205
 
957
1206
  @property
@@ -964,13 +1213,13 @@ class AnnDataCatManager(CatManager):
964
1213
  """Return the obs fields to validate against."""
965
1214
  return self._obs_fields
966
1215
 
967
- def lookup(self, public: bool = False) -> CurateLookup:
1216
+ def lookup(self, public: bool = False) -> CatLookup:
968
1217
  """Lookup categories.
969
1218
 
970
1219
  Args:
971
1220
  public: If "public", the lookup is performed on the public reference.
972
1221
  """
973
- return CurateLookup(
1222
+ return CatLookup(
974
1223
  categoricals=self._obs_fields,
975
1224
  slots={"columns": self._columns_field, "var_index": self._var_field},
976
1225
  public=public,
@@ -989,7 +1238,6 @@ class AnnDataCatManager(CatManager):
989
1238
  validated_only=validated_only,
990
1239
  organism=self._organism,
991
1240
  source=self._sources.get("var_index"),
992
- exclude=self._exclude.get("var_index"),
993
1241
  )
994
1242
 
995
1243
  def add_new_from(self, key: str, **kwargs):
@@ -1033,7 +1281,6 @@ class AnnDataCatManager(CatManager):
1033
1281
  key="var_index",
1034
1282
  source=self._sources.get("var_index"),
1035
1283
  hint_print=".add_new_from_var_index()",
1036
- exclude=self._exclude.get("var_index"),
1037
1284
  organism=self._organism, # type: ignore
1038
1285
  )
1039
1286
  else:
@@ -1077,59 +1324,29 @@ class AnnDataCatManager(CatManager):
1077
1324
 
1078
1325
 
1079
1326
  class MuDataCatManager(CatManager):
1080
- """Curation flow for a ``MuData`` object.
1081
-
1082
- Args:
1083
- mdata: The MuData object to curate.
1084
- var_index: The registry field for mapping the ``.var`` index for each modality.
1085
- For example:
1086
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
1087
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
1088
- Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
1089
- verbosity: The verbosity level.
1090
- organism: The organism name.
1091
- sources: A dictionary mapping ``.obs.columns`` to Source records.
1092
- exclude: A dictionary mapping column names to values to exclude from validation.
1093
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1094
- using the exclude parameter ensures they are not validated.
1095
-
1096
- Examples:
1097
- >>> import bionty as bt
1098
- >>> curator = ln.Curator.from_mudata(
1099
- ... mdata,
1100
- ... var_index={
1101
- ... "rna": bt.Gene.ensembl_gene_id,
1102
- ... "adt": CellMarker.name
1103
- ... },
1104
- ... categoricals={
1105
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1106
- ... "donor_id": ULabel.name
1107
- ... },
1108
- ... organism="human",
1109
- ... )
1110
- """
1327
+ """Categorical manager for `MuData`."""
1111
1328
 
1112
1329
  def __init__(
1113
1330
  self,
1114
1331
  mdata: MuData | Artifact,
1115
- var_index: dict[str, FieldAttr],
1332
+ var_index: dict[str, FieldAttr] | None = None,
1116
1333
  categoricals: dict[str, FieldAttr] | None = None,
1117
1334
  verbosity: str = "hint",
1118
1335
  organism: str | None = None,
1119
1336
  sources: dict[str, Record] | None = None,
1120
- exclude: dict | None = None, # {modality: {field: [values]}}
1121
1337
  ) -> None:
1122
1338
  super().__init__(
1123
1339
  dataset=mdata,
1124
1340
  categoricals={},
1125
1341
  sources=sources,
1126
1342
  organism=organism,
1127
- exclude=exclude,
1128
1343
  )
1129
- self._columns_field = var_index # this is for consistency with BaseCatManager
1130
- self._var_fields = var_index
1344
+ self._columns_field = (
1345
+ var_index or {}
1346
+ ) # this is for consistency with BaseCatManager
1347
+ self._var_fields = var_index or {}
1131
1348
  self._verify_modality(self._var_fields.keys())
1132
- self._obs_fields = self._parse_categoricals(categoricals)
1349
+ self._obs_fields = self._parse_categoricals(categoricals or {})
1133
1350
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
1134
1351
  self._verbosity = verbosity
1135
1352
  self._obs_df_curator = None
@@ -1140,7 +1357,6 @@ class MuDataCatManager(CatManager):
1140
1357
  categoricals=self._obs_fields.get("obs", {}),
1141
1358
  verbosity=verbosity,
1142
1359
  sources=self._sources.get("obs"),
1143
- exclude=self._exclude.get("obs"),
1144
1360
  organism=organism,
1145
1361
  )
1146
1362
  self._mod_adata_curators = {
@@ -1150,7 +1366,6 @@ class MuDataCatManager(CatManager):
1150
1366
  categoricals=self._obs_fields.get(modality),
1151
1367
  verbosity=verbosity,
1152
1368
  sources=self._sources.get(modality),
1153
- exclude=self._exclude.get(modality),
1154
1369
  organism=organism,
1155
1370
  )
1156
1371
  for modality in self._modalities
@@ -1199,7 +1414,7 @@ class MuDataCatManager(CatManager):
1199
1414
  obs_fields["obs"][k] = v
1200
1415
  return obs_fields
1201
1416
 
1202
- def lookup(self, public: bool = False) -> CurateLookup:
1417
+ def lookup(self, public: bool = False) -> CatLookup:
1203
1418
  """Lookup categories.
1204
1419
 
1205
1420
  Args:
@@ -1212,7 +1427,7 @@ class MuDataCatManager(CatManager):
1212
1427
  obs_fields[k] = v
1213
1428
  else:
1214
1429
  obs_fields[f"{mod}:{k}"] = v
1215
- return CurateLookup(
1430
+ return CatLookup(
1216
1431
  categoricals=obs_fields,
1217
1432
  slots={
1218
1433
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
@@ -1271,8 +1486,6 @@ class MuDataCatManager(CatManager):
1271
1486
 
1272
1487
  def validate(self) -> bool:
1273
1488
  """Validate categories."""
1274
- from lamindb.core._settings import settings
1275
-
1276
1489
  # add all validated records to the current instance
1277
1490
  verbosity = settings.verbosity
1278
1491
  try:
@@ -1329,393 +1542,290 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1329
1542
  )
1330
1543
 
1331
1544
 
1332
- class TiledbsomaCatManager(CatManager):
1333
- """Curation flow for `tiledbsoma.Experiment`.
1334
-
1335
- Args:
1336
- experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
1337
- var_index: The registry fields for mapping the `.var` indices for measurements.
1338
- Should be in the form `{"measurement name": ("var column", field)}`.
1339
- These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
1340
- in `.standardize` or `.add_new_from`, see the output of `.var_index`.
1341
- categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
1342
- obs_columns: The registry field for mapping the names of the `.obs` columns.
1343
- organism: The organism name.
1344
- sources: A dictionary mapping `.obs` columns to Source records.
1345
- exclude: A dictionary mapping column names to values to exclude from validation.
1346
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1347
- using the exclude parameter ensures they are not validated.
1348
-
1349
- Examples:
1350
- >>> import bionty as bt
1351
- >>> curator = ln.Curator.from_tiledbsoma(
1352
- ... "./my_array_store.tiledbsoma",
1353
- ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1354
- ... categoricals={
1355
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1356
- ... "donor_id": ULabel.name
1357
- ... },
1358
- ... organism="human",
1359
- ... )
1360
- """
1545
+ class SpatialDataCatManager(CatManager):
1546
+ """Categorical manager for `SpatialData`."""
1361
1547
 
1362
1548
  def __init__(
1363
1549
  self,
1364
- experiment_uri: UPathStr | Artifact,
1365
- var_index: dict[str, tuple[str, FieldAttr]],
1366
- categoricals: dict[str, FieldAttr] | None = None,
1367
- obs_columns: FieldAttr = Feature.name,
1550
+ sdata: Any,
1551
+ var_index: dict[str, FieldAttr],
1552
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1553
+ verbosity: str = "hint",
1368
1554
  organism: str | None = None,
1369
- sources: dict[str, Record] | None = None,
1370
- exclude: dict[str, str | list[str]] | None = None,
1371
- ):
1372
- self._obs_fields = categoricals or {}
1373
- self._var_fields = var_index
1374
- self._columns_field = obs_columns
1375
- if isinstance(experiment_uri, Artifact):
1376
- self._dataset = experiment_uri.path
1377
- self._artifact = experiment_uri
1555
+ sources: dict[str, dict[str, Record]] | None = None,
1556
+ *,
1557
+ sample_metadata_key: str | None = "sample",
1558
+ ) -> None:
1559
+ super().__init__(
1560
+ dataset=sdata,
1561
+ categoricals={},
1562
+ sources=sources,
1563
+ organism=organism,
1564
+ )
1565
+ if isinstance(sdata, Artifact):
1566
+ self._sdata = sdata.load()
1378
1567
  else:
1379
- self._dataset = UPath(experiment_uri)
1380
- self._artifact = None
1381
- self._organism = organism
1382
- self._sources = sources or {}
1383
- self._exclude = exclude or {}
1384
-
1385
- self._is_validated: bool | None = False
1386
- self._non_validated_values: dict[str, list] | None = None
1387
- self._validated_values: dict[str, list] = {}
1388
- # filled by _check_save_keys
1389
- self._n_obs: int | None = None
1390
- self._valid_obs_keys: list[str] | None = None
1391
- self._obs_pa_schema: pa.lib.Schema | None = (
1392
- None # this is needed to create the obs feature set
1568
+ self._sdata = self._dataset
1569
+ self._sample_metadata_key = sample_metadata_key
1570
+ self._write_path = None
1571
+ self._var_fields = var_index
1572
+ self._verify_accessor_exists(self._var_fields.keys())
1573
+ self._categoricals = categoricals
1574
+ self._table_keys = set(self._var_fields.keys()) | set(
1575
+ self._categoricals.keys() - {self._sample_metadata_key}
1393
1576
  )
1394
- self._valid_var_keys: list[str] | None = None
1395
- self._var_fields_flat: dict[str, FieldAttr] | None = None
1396
- self._check_save_keys()
1397
-
1398
- # check that the provided keys in var_index and categoricals are available in the store
1399
- # and save features
1400
- def _check_save_keys(self):
1401
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1402
-
1403
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1404
- experiment_obs = experiment.obs
1405
- self._n_obs = len(experiment_obs)
1406
- self._obs_pa_schema = experiment_obs.schema
1407
- valid_obs_keys = [
1408
- k for k in self._obs_pa_schema.names if k != "soma_joinid"
1409
- ]
1410
- self._valid_obs_keys = valid_obs_keys
1411
-
1412
- valid_var_keys = []
1413
- ms_list = []
1414
- for ms in experiment.ms.keys():
1415
- ms_list.append(ms)
1416
- var_ms = experiment.ms[ms].var
1417
- valid_var_keys += [
1418
- f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1419
- ]
1420
- self._valid_var_keys = valid_var_keys
1577
+ self._verbosity = verbosity
1578
+ self._sample_df_curator = None
1579
+ if self._sample_metadata_key is not None:
1580
+ self._sample_metadata = self._sdata.get_attrs(
1581
+ key=self._sample_metadata_key, return_as="df", flatten=True
1582
+ )
1583
+ self._is_validated = False
1421
1584
 
1422
- # check validity of keys in categoricals
1585
+ # Check validity of keys in categoricals
1423
1586
  nonval_keys = []
1424
- for obs_key in self._obs_fields.keys():
1425
- if obs_key not in valid_obs_keys:
1426
- nonval_keys.append(obs_key)
1587
+ for accessor, accessor_categoricals in self._categoricals.items():
1588
+ if (
1589
+ accessor == self._sample_metadata_key
1590
+ and self._sample_metadata is not None
1591
+ ):
1592
+ for key in accessor_categoricals.keys():
1593
+ if key not in self._sample_metadata.columns:
1594
+ nonval_keys.append(key)
1595
+ else:
1596
+ for key in accessor_categoricals.keys():
1597
+ if key not in self._sdata[accessor].obs.columns:
1598
+ nonval_keys.append(key)
1599
+
1427
1600
  _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1428
1601
 
1429
- # check validity of keys in var_index
1430
- self._var_fields_flat = {}
1602
+ # check validity of keys in sources
1431
1603
  nonval_keys = []
1432
- for ms_key in self._var_fields.keys():
1433
- var_key, var_field = self._var_fields[ms_key]
1434
- var_key_flat = f"{ms_key}__{var_key}"
1435
- if var_key_flat not in valid_var_keys:
1436
- nonval_keys.append(f"({ms_key}, {var_key})")
1604
+ for accessor, accessor_sources in self._sources.items():
1605
+ if (
1606
+ accessor == self._sample_metadata_key
1607
+ and self._sample_metadata is not None
1608
+ ):
1609
+ columns = self._sample_metadata.columns
1610
+ elif accessor != self._sample_metadata_key:
1611
+ columns = self._sdata[accessor].obs.columns
1437
1612
  else:
1438
- self._var_fields_flat[var_key_flat] = var_field
1439
- _maybe_curation_keys_not_present(nonval_keys, "var_index")
1440
-
1441
- # check validity of keys in sources and exclude
1442
- valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1443
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1444
- nonval_keys = []
1445
- for arg_key in dct.keys():
1446
- if arg_key not in valid_arg_keys:
1447
- nonval_keys.append(arg_key)
1448
- _maybe_curation_keys_not_present(nonval_keys, name)
1613
+ continue
1614
+ for key in accessor_sources:
1615
+ if key not in columns:
1616
+ nonval_keys.append(key)
1617
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1449
1618
 
1450
- # register obs columns' names
1451
- register_columns = list(self._obs_fields.keys())
1452
- organism = check_registry_organism(
1453
- self._columns_field.field.model, self._organism
1454
- ).get("organism")
1455
- update_registry(
1456
- values=register_columns,
1457
- field=self._columns_field,
1458
- key="columns",
1459
- validated_only=False,
1460
- organism=organism,
1461
- source=self._sources.get("columns"),
1462
- exclude=self._exclude.get("columns"),
1463
- )
1464
- additional_columns = [k for k in valid_obs_keys if k not in register_columns]
1465
- # no need to register with validated_only=True if columns are features
1619
+ # Set up sample level metadata and table Curator objects
1466
1620
  if (
1467
- len(additional_columns) > 0
1468
- and self._columns_field.field.model is not Feature
1621
+ self._sample_metadata_key is not None
1622
+ and self._sample_metadata_key in self._categoricals
1469
1623
  ):
1470
- update_registry(
1471
- values=additional_columns,
1472
- field=self._columns_field,
1473
- key="columns",
1474
- validated_only=True,
1624
+ self._sample_df_curator = DataFrameCatManager(
1625
+ df=self._sample_metadata,
1626
+ columns=Feature.name,
1627
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1628
+ verbosity=verbosity,
1629
+ sources=self._sources.get(self._sample_metadata_key),
1630
+ organism=organism,
1631
+ )
1632
+ self._table_adata_curators = {
1633
+ table: AnnDataCatManager(
1634
+ data=self._sdata[table],
1635
+ var_index=var_index.get(table),
1636
+ categoricals=self._categoricals.get(table),
1637
+ verbosity=verbosity,
1638
+ sources=self._sources.get(table),
1475
1639
  organism=organism,
1476
- source=self._sources.get("columns"),
1477
- exclude=self._exclude.get("columns"),
1478
1640
  )
1641
+ for table in self._table_keys
1642
+ }
1479
1643
 
1480
- def validate(self):
1481
- """Validate categories."""
1482
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1644
+ self._non_validated = None
1483
1645
 
1484
- validated = True
1485
- self._non_validated_values = {}
1486
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1487
- for ms, (key, field) in self._var_fields.items():
1488
- var_ms = experiment.ms[ms].var
1489
- var_ms_key = f"{ms}__{key}"
1490
- # it was already validated and cached
1491
- if var_ms_key in self._validated_values:
1492
- continue
1493
- var_ms_values = (
1494
- var_ms.read(column_names=[key]).concat()[key].to_pylist()
1495
- )
1496
- organism = check_registry_organism(
1497
- field.field.model, self._organism
1498
- ).get("organism")
1499
- update_registry(
1500
- values=var_ms_values,
1501
- field=field,
1502
- key=var_ms_key,
1503
- validated_only=True,
1504
- organism=organism,
1505
- source=self._sources.get(var_ms_key),
1506
- exclude=self._exclude.get(var_ms_key),
1507
- )
1508
- _, non_val = validate_categories(
1509
- values=var_ms_values,
1510
- field=field,
1511
- key=var_ms_key,
1512
- organism=organism,
1513
- source=self._sources.get(var_ms_key),
1514
- exclude=self._exclude.get(var_ms_key),
1515
- )
1516
- if len(non_val) > 0:
1517
- validated = False
1518
- self._non_validated_values[var_ms_key] = non_val
1519
- else:
1520
- self._validated_values[var_ms_key] = var_ms_values
1646
+ @property
1647
+ def var_index(self) -> FieldAttr:
1648
+ """Return the registry fields to validate variables indices against."""
1649
+ return self._var_fields
1521
1650
 
1522
- obs = experiment.obs
1523
- for key, field in self._obs_fields.items():
1524
- # already validated and cached
1525
- if key in self._validated_values:
1526
- continue
1527
- values = pa.compute.unique(
1528
- obs.read(column_names=[key]).concat()[key]
1529
- ).to_pylist()
1530
- organism = check_registry_organism(
1531
- field.field.model, self._organism
1532
- ).get("organism")
1533
- update_registry(
1534
- values=values,
1535
- field=field,
1536
- key=key,
1537
- validated_only=True,
1538
- organism=organism,
1539
- source=self._sources.get(key),
1540
- exclude=self._exclude.get(key),
1541
- )
1542
- _, non_val = validate_categories(
1543
- values=values,
1544
- field=field,
1545
- key=key,
1546
- organism=organism,
1547
- source=self._sources.get(key),
1548
- exclude=self._exclude.get(key),
1549
- )
1550
- if len(non_val) > 0:
1551
- validated = False
1552
- self._non_validated_values[key] = non_val
1553
- else:
1554
- self._validated_values[key] = values
1555
- self._is_validated = validated
1556
- return self._is_validated
1651
+ @property
1652
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1653
+ """Return the categorical keys and fields to validate against."""
1654
+ return self._categoricals
1557
1655
 
1558
- def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1559
- assert self._non_validated_values is not None # noqa: S101
1656
+ @property
1657
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1658
+ """Return the non-validated features and labels."""
1659
+ if self._non_validated is None:
1660
+ raise ValidationError("Please run validate() first!")
1661
+ return self._non_validated
1560
1662
 
1561
- if key in self._valid_obs_keys:
1562
- field = self._obs_fields[key]
1563
- elif key in self._valid_var_keys:
1564
- ms = key.partition("__")[0]
1565
- field = self._var_fields[ms][1]
1566
- else:
1567
- raise KeyError(f"key {key} is invalid!")
1568
- values = self._non_validated_values.get(key, [])
1569
- return values, field
1663
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1664
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1665
+ for acc in accessors:
1666
+ is_present = False
1667
+ try:
1668
+ self._sdata.get_attrs(key=acc)
1669
+ is_present = True
1670
+ except KeyError:
1671
+ if acc in self._sdata.tables.keys():
1672
+ is_present = True
1673
+ if not is_present:
1674
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1570
1675
 
1571
- def add_new_from(self, key: str, **kwargs) -> None:
1572
- """Add validated & new categories.
1676
+ def lookup(self, public: bool = False) -> CatLookup:
1677
+ """Look up categories.
1573
1678
 
1574
1679
  Args:
1575
- key: The key referencing the slot in the `tiledbsoma` store.
1576
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1577
- or a column name in `.obs`.
1680
+ public: Whether the lookup is performed on the public reference.
1578
1681
  """
1579
- if self._non_validated_values is None:
1580
- raise ValidationError("Run .validate() first.")
1581
- if key == "all":
1582
- keys = list(self._non_validated_values.keys())
1583
- else:
1584
- avail_keys = list(
1585
- chain(self._non_validated_values.keys(), self._validated_values.keys())
1682
+ cat_values_dict = list(self.categoricals.values())[0]
1683
+ return CatLookup(
1684
+ categoricals=cat_values_dict,
1685
+ slots={"accessors": cat_values_dict.keys()},
1686
+ public=public,
1687
+ )
1688
+
1689
+ def _update_registry_all(self) -> None:
1690
+ """Saves labels of all features for sample and table metadata."""
1691
+ if self._sample_df_curator is not None:
1692
+ self._sample_df_curator._update_registry_all(
1693
+ validated_only=True,
1586
1694
  )
1587
- if key not in avail_keys:
1588
- raise KeyError(
1589
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1590
- )
1591
- keys = [key]
1592
- for k in keys:
1593
- values, field = self._non_validated_values_field(k)
1594
- if len(values) == 0:
1595
- continue
1596
- organism = check_registry_organism(field.field.model, self._organism).get(
1597
- "organism"
1598
- )
1599
- update_registry(
1600
- values=values,
1601
- field=field,
1602
- key=k,
1603
- validated_only=False,
1604
- organism=organism,
1605
- source=self._sources.get(k),
1606
- exclude=self._exclude.get(k),
1607
- **kwargs,
1695
+ for _, adata_curator in self._table_adata_curators.items():
1696
+ adata_curator._obs_df_curator._update_registry_all(
1697
+ validated_only=True,
1608
1698
  )
1609
- # update non-validated values list but keep the key there
1610
- # it will be removed by .validate()
1611
- if k in self._non_validated_values:
1612
- self._non_validated_values[k] = []
1613
1699
 
1614
- @property
1615
- def non_validated(self) -> dict[str, list]:
1616
- """Return the non-validated features and labels."""
1617
- non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
1618
- return non_val
1700
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1701
+ """Save new values from ``.var.index`` of table.
1619
1702
 
1620
- @property
1621
- def var_index(self) -> dict[str, FieldAttr]:
1622
- """Return the registry fields with flattened keys to validate variables indices against."""
1623
- return self._var_fields_flat
1703
+ Args:
1704
+ table: The table key.
1705
+ organism: The organism name.
1706
+ **kwargs: Additional keyword arguments to pass to create new records.
1707
+ """
1708
+ if self._non_validated is None:
1709
+ raise ValidationError("Run .validate() first.")
1710
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1711
+ if table in self.non_validated.keys():
1712
+ if "var_index" in self._non_validated[table]:
1713
+ self._non_validated[table].pop("var_index")
1624
1714
 
1625
- @property
1626
- def categoricals(self) -> dict[str, FieldAttr]:
1627
- """Return the obs fields to validate against."""
1628
- return self._obs_fields
1715
+ if len(self.non_validated[table].values()) == 0:
1716
+ self.non_validated.pop(table)
1629
1717
 
1630
- def lookup(self, public: bool = False) -> CurateLookup:
1631
- """Lookup categories.
1718
+ def add_new_from(
1719
+ self,
1720
+ key: str,
1721
+ accessor: str | None = None,
1722
+ **kwargs,
1723
+ ) -> None:
1724
+ """Save new values of categorical from sample level metadata or table.
1632
1725
 
1633
1726
  Args:
1634
- public: If "public", the lookup is performed on the public reference.
1727
+ key: The key referencing the slot in the DataFrame.
1728
+ accessor: The accessor key such as 'sample' or 'table x'.
1729
+ organism: The organism name.
1730
+ **kwargs: Additional keyword arguments to pass to create new records.
1635
1731
  """
1636
- return CurateLookup(
1637
- categoricals=self._obs_fields,
1638
- slots={"columns": self._columns_field, **self._var_fields_flat},
1639
- public=public,
1640
- )
1732
+ if self._non_validated is None:
1733
+ raise ValidationError("Run .validate() first.")
1641
1734
 
1642
- def standardize(self, key: str):
1643
- """Replace synonyms with standardized values.
1735
+ if len(kwargs) > 0 and key == "all":
1736
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
1737
+
1738
+ if accessor not in self.categoricals:
1739
+ raise ValueError(
1740
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
1741
+ )
1742
+
1743
+ if accessor in self._table_adata_curators:
1744
+ adata_curator = self._table_adata_curators[accessor]
1745
+ adata_curator.add_new_from(key=key, **kwargs)
1746
+ if accessor == self._sample_metadata_key:
1747
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
1748
+
1749
+ if accessor in self.non_validated.keys():
1750
+ if len(self.non_validated[accessor].values()) == 0:
1751
+ self.non_validated.pop(accessor)
1752
+
1753
+ def standardize(self, key: str, accessor: str | None = None) -> None:
1754
+ """Replace synonyms with canonical values.
1644
1755
 
1645
1756
  Modifies the dataset inplace.
1646
1757
 
1647
1758
  Args:
1648
- key: The key referencing the slot in the `tiledbsoma` store.
1649
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1650
- or a column name in `.obs`.
1759
+ key: The key referencing the slot in the table or sample metadata.
1760
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
1651
1761
  """
1652
1762
  if len(self.non_validated) == 0:
1653
1763
  logger.warning("values are already standardized")
1654
1764
  return
1655
- avail_keys = list(self._non_validated_values.keys())
1656
- if key == "all":
1657
- keys = avail_keys
1765
+ if self._artifact is not None:
1766
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1767
+
1768
+ if accessor == self._sample_metadata_key:
1769
+ if key not in self._sample_metadata.columns:
1770
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1658
1771
  else:
1659
- if key not in avail_keys:
1660
- raise KeyError(
1661
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1662
- )
1663
- keys = [key]
1772
+ if (
1773
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
1774
+ ) or (
1775
+ key != "var_index"
1776
+ and key not in self._sdata.tables[accessor].obs.columns
1777
+ ):
1778
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1664
1779
 
1665
- for k in keys:
1666
- values, field = self._non_validated_values_field(k)
1667
- if len(values) == 0:
1668
- continue
1669
- if k in self._valid_var_keys:
1670
- ms, _, slot_key = k.partition("__")
1671
- slot = lambda experiment: experiment.ms[ms].var # noqa: B023
1672
- else:
1673
- slot = lambda experiment: experiment.obs
1674
- slot_key = k
1675
- # errors if public ontology and the model has no organism
1676
- # has to be fixed in bionty
1677
- organism = check_registry_organism(field.field.model, self._organism).get(
1678
- "organism"
1679
- )
1680
- syn_mapper = standardize_categories(
1681
- values=values,
1682
- field=field,
1683
- source=self._sources.get(k),
1684
- organism=organism,
1685
- )
1686
- if (n_syn_mapper := len(syn_mapper)) == 0:
1687
- continue
1780
+ if accessor in self._table_adata_curators.keys():
1781
+ adata_curator = self._table_adata_curators[accessor]
1782
+ adata_curator.standardize(key)
1783
+ if accessor == self._sample_metadata_key:
1784
+ self._sample_df_curator.standardize(key)
1688
1785
 
1689
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1786
+ if len(self.non_validated[accessor].values()) == 0:
1787
+ self.non_validated.pop(accessor)
1690
1788
 
1691
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1692
- value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1693
- table = slot(experiment).read(value_filter=value_filter).concat()
1789
+ def validate(self) -> bool:
1790
+ """Validate variables and categorical observations.
1694
1791
 
1695
- if len(table) == 0:
1696
- continue
1792
+ This method also registers the validated records in the current instance:
1793
+ - from public sources
1697
1794
 
1698
- df = table.to_pandas()
1699
- # map values
1700
- df[slot_key] = df[slot_key].map(
1701
- lambda val: syn_mapper.get(val, val) # noqa
1702
- )
1703
- # write the mapped values
1704
- with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1705
- slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1706
- # update non_validated dict
1707
- non_val_k = [
1708
- nv for nv in self._non_validated_values[k] if nv not in syn_mapper
1709
- ]
1710
- self._non_validated_values[k] = non_val_k
1795
+ Args:
1796
+ organism: The organism name.
1711
1797
 
1712
- syn_mapper_print = _format_values(
1713
- [f'"{m_k}" "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
1714
- )
1715
- s = "s" if n_syn_mapper > 1 else ""
1716
- logger.success(
1717
- f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
1718
- )
1798
+ Returns:
1799
+ Whether the SpatialData object is validated.
1800
+ """
1801
+ # add all validated records to the current instance
1802
+ verbosity = settings.verbosity
1803
+ try:
1804
+ settings.verbosity = "error"
1805
+ self._update_registry_all()
1806
+ finally:
1807
+ settings.verbosity = verbosity
1808
+
1809
+ self._non_validated = {} # type: ignore
1810
+
1811
+ sample_validated = True
1812
+ if self._sample_df_curator:
1813
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
1814
+ sample_validated &= self._sample_df_curator.validate()
1815
+ if len(self._sample_df_curator.non_validated) > 0:
1816
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
1817
+ logger.print("")
1818
+
1819
+ mods_validated = True
1820
+ for table, adata_curator in self._table_adata_curators.items():
1821
+ logger.info(f"validating categoricals of table '{table}' ...")
1822
+ mods_validated &= adata_curator.validate()
1823
+ if len(adata_curator.non_validated) > 0:
1824
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
1825
+ logger.print("")
1826
+
1827
+ self._is_validated = sample_validated & mods_validated
1828
+ return self._is_validated
1719
1829
 
1720
1830
  def save_artifact(
1721
1831
  self,
@@ -1725,423 +1835,388 @@ class TiledbsomaCatManager(CatManager):
1725
1835
  revises: Artifact | None = None,
1726
1836
  run: Run | None = None,
1727
1837
  ) -> Artifact:
1728
- """Save the validated `tiledbsoma` store and metadata.
1838
+ """Save the validated SpatialData store and metadata.
1729
1839
 
1730
1840
  Args:
1731
- description: A description of the ``tiledbsoma`` store.
1841
+ description: A description of the dataset.
1732
1842
  key: A path-like key to reference artifact in default storage,
1733
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1843
+ e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
1734
1844
  revises: Previous version of the artifact. Triggers a revision.
1735
1845
  run: The run that creates the artifact.
1736
1846
 
1737
1847
  Returns:
1738
1848
  A saved artifact record.
1739
1849
  """
1740
- from lamindb.models.artifact import add_labels
1741
-
1742
1850
  if not self._is_validated:
1743
1851
  self.validate()
1744
1852
  if not self._is_validated:
1745
1853
  raise ValidationError("Dataset does not validate. Please curate.")
1746
1854
 
1747
- if self._artifact is None:
1748
- artifact = Artifact(
1749
- self._dataset,
1750
- description=description,
1751
- key=key,
1752
- revises=revises,
1753
- run=run,
1754
- )
1755
- artifact.n_observations = self._n_obs
1756
- artifact.otype = "tiledbsoma"
1757
- artifact.save()
1855
+ return save_artifact(
1856
+ self._sdata,
1857
+ description=description,
1858
+ fields=self.categoricals,
1859
+ index_field=self.var_index,
1860
+ key=key,
1861
+ artifact=self._artifact,
1862
+ revises=revises,
1863
+ run=run,
1864
+ schema=None,
1865
+ organism=self._organism,
1866
+ sample_metadata_key=self._sample_metadata_key,
1867
+ )
1868
+
1869
+
1870
+ class TiledbsomaCatManager(CatManager):
1871
+ """Categorical manager for `tiledbsoma.Experiment`."""
1872
+
1873
+ def __init__(
1874
+ self,
1875
+ experiment_uri: UPathStr | Artifact,
1876
+ var_index: dict[str, tuple[str, FieldAttr]],
1877
+ categoricals: dict[str, FieldAttr] | None = None,
1878
+ obs_columns: FieldAttr = Feature.name,
1879
+ organism: str | None = None,
1880
+ sources: dict[str, Record] | None = None,
1881
+ ):
1882
+ self._obs_fields = categoricals or {}
1883
+ self._var_fields = var_index
1884
+ self._columns_field = obs_columns
1885
+ if isinstance(experiment_uri, Artifact):
1886
+ self._dataset = experiment_uri.path
1887
+ self._artifact = experiment_uri
1758
1888
  else:
1759
- artifact = self._artifact
1889
+ self._dataset = UPath(experiment_uri)
1890
+ self._artifact = None
1891
+ self._organism = organism
1892
+ self._sources = sources or {}
1760
1893
 
1761
- feature_sets = {}
1762
- if len(self._obs_fields) > 0:
1763
- organism = check_registry_organism(
1764
- self._columns_field.field.model, self._organism
1765
- ).get("organism")
1766
- empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
1767
- mock_df = pa.Table.from_pydict(
1768
- empty_dict, schema=self._obs_pa_schema
1769
- ).to_pandas()
1770
- # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1771
- feature_sets["obs"] = Schema.from_df(
1772
- df=mock_df,
1773
- field=self._columns_field,
1774
- mute=True,
1775
- organism=organism,
1776
- )
1777
- for ms in self._var_fields:
1778
- var_key, var_field = self._var_fields[ms]
1779
- organism = check_registry_organism(
1780
- var_field.field.model, self._organism
1781
- ).get("organism")
1782
- feature_sets[f"{ms}__var"] = Schema.from_values(
1783
- values=self._validated_values[f"{ms}__{var_key}"],
1784
- field=var_field,
1785
- organism=organism,
1786
- raise_validation_error=False,
1787
- )
1788
- artifact._staged_feature_sets = feature_sets
1789
-
1790
- feature_ref_is_name = _ref_is_name(self._columns_field)
1791
- features = Feature.lookup().dict()
1792
- for key, field in self._obs_fields.items():
1793
- feature = features.get(key)
1794
- registry = field.field.model
1795
- organism = check_registry_organism(field.field.model, self._organism).get(
1796
- "organism"
1797
- )
1798
- labels = registry.from_values(
1799
- values=self._validated_values[key], field=field, organism=organism
1800
- )
1801
- if len(labels) == 0:
1802
- continue
1803
- if hasattr(registry, "_name_field"):
1804
- label_ref_is_name = field.field.name == registry._name_field
1805
- add_labels(
1806
- artifact,
1807
- records=labels,
1808
- feature=feature,
1809
- feature_ref_is_name=feature_ref_is_name,
1810
- label_ref_is_name=label_ref_is_name,
1811
- from_curator=True,
1812
- )
1813
-
1814
- return artifact.save()
1815
-
1816
-
1817
- class SpatialDataCatManager(CatManager):
1818
- """Curation flow for a ``Spatialdata`` object.
1819
-
1820
- See also :class:`~lamindb.Curator`.
1821
-
1822
- Note that if genes or other measurements are removed from the SpatialData object,
1823
- the object should be recreated.
1894
+ self._is_validated: bool | None = False
1895
+ self._non_validated_values: dict[str, list] | None = None
1896
+ self._validated_values: dict[str, list] = {}
1897
+ # filled by _check_save_keys
1898
+ self._n_obs: int | None = None
1899
+ self._valid_obs_keys: list[str] | None = None
1900
+ self._obs_pa_schema: pa.lib.Schema | None = (
1901
+ None # this is needed to create the obs feature set
1902
+ )
1903
+ self._valid_var_keys: list[str] | None = None
1904
+ self._var_fields_flat: dict[str, FieldAttr] | None = None
1905
+ self._check_save_keys()
1824
1906
 
1825
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1907
+ # check that the provided keys in var_index and categoricals are available in the store
1908
+ # and save features
1909
+ def _check_save_keys(self):
1910
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1826
1911
 
1827
- Args:
1828
- sdata: The SpatialData object to curate.
1829
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1830
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1912
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1913
+ experiment_obs = experiment.obs
1914
+ self._n_obs = len(experiment_obs)
1915
+ self._obs_pa_schema = experiment_obs.schema
1916
+ valid_obs_keys = [
1917
+ k for k in self._obs_pa_schema.names if k != "soma_joinid"
1918
+ ]
1919
+ self._valid_obs_keys = valid_obs_keys
1831
1920
 
1832
- organism: The organism name.
1833
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1834
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1835
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1836
- using the exclude parameter ensures they are not validated.
1837
- verbosity: The verbosity level of the logger.
1838
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1839
-
1840
- Examples:
1841
- >>> import bionty as bt
1842
- >>> curator = SpatialDataCatManager(
1843
- ... sdata,
1844
- ... var_index={
1845
- ... "table_1": bt.Gene.ensembl_gene_id,
1846
- ... },
1847
- ... categoricals={
1848
- ... "table1":
1849
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1850
- ... "sample":
1851
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1852
- ... },
1853
- ... organism="human",
1854
- ... )
1855
- """
1921
+ valid_var_keys = []
1922
+ ms_list = []
1923
+ for ms in experiment.ms.keys():
1924
+ ms_list.append(ms)
1925
+ var_ms = experiment.ms[ms].var
1926
+ valid_var_keys += [
1927
+ f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1928
+ ]
1929
+ self._valid_var_keys = valid_var_keys
1856
1930
 
1857
- def __init__(
1858
- self,
1859
- sdata: Any,
1860
- var_index: dict[str, FieldAttr],
1861
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1862
- verbosity: str = "hint",
1863
- organism: str | None = None,
1864
- sources: dict[str, dict[str, Record]] | None = None,
1865
- exclude: dict[str, dict] | None = None,
1866
- *,
1867
- sample_metadata_key: str | None = "sample",
1868
- ) -> None:
1869
- super().__init__(
1870
- dataset=sdata,
1871
- categoricals={},
1872
- sources=sources,
1873
- organism=organism,
1874
- exclude=exclude,
1875
- )
1876
- if isinstance(sdata, Artifact):
1877
- self._sdata = sdata.load()
1878
- else:
1879
- self._sdata = self._dataset
1880
- self._sample_metadata_key = sample_metadata_key
1881
- self._write_path = None
1882
- self._var_fields = var_index
1883
- self._verify_accessor_exists(self._var_fields.keys())
1884
- self._categoricals = categoricals
1885
- self._table_keys = set(self._var_fields.keys()) | set(
1886
- self._categoricals.keys() - {self._sample_metadata_key}
1887
- )
1888
- self._verbosity = verbosity
1889
- self._sample_df_curator = None
1890
- if self._sample_metadata_key is not None:
1891
- self._sample_metadata = self._sdata.get_attrs(
1892
- key=self._sample_metadata_key, return_as="df", flatten=True
1893
- )
1894
- self._is_validated = False
1931
+ # check validity of keys in categoricals
1932
+ nonval_keys = []
1933
+ for obs_key in self._obs_fields.keys():
1934
+ if obs_key not in valid_obs_keys:
1935
+ nonval_keys.append(obs_key)
1936
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1895
1937
 
1896
- # Check validity of keys in categoricals
1938
+ # check validity of keys in var_index
1939
+ self._var_fields_flat = {}
1897
1940
  nonval_keys = []
1898
- for accessor, accessor_categoricals in self._categoricals.items():
1899
- if (
1900
- accessor == self._sample_metadata_key
1901
- and self._sample_metadata is not None
1902
- ):
1903
- for key in accessor_categoricals.keys():
1904
- if key not in self._sample_metadata.columns:
1905
- nonval_keys.append(key)
1941
+ for ms_key in self._var_fields.keys():
1942
+ var_key, var_field = self._var_fields[ms_key]
1943
+ var_key_flat = f"{ms_key}__{var_key}"
1944
+ if var_key_flat not in valid_var_keys:
1945
+ nonval_keys.append(f"({ms_key}, {var_key})")
1906
1946
  else:
1907
- for key in accessor_categoricals.keys():
1908
- if key not in self._sdata[accessor].obs.columns:
1909
- nonval_keys.append(key)
1910
-
1911
- _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1947
+ self._var_fields_flat[var_key_flat] = var_field
1948
+ _maybe_curation_keys_not_present(nonval_keys, "var_index")
1912
1949
 
1913
- # check validity of keys in sources and exclude
1914
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1915
- nonval_keys = []
1916
- for accessor, accessor_sources in dct.items():
1917
- if (
1918
- accessor == self._sample_metadata_key
1919
- and self._sample_metadata is not None
1920
- ):
1921
- columns = self._sample_metadata.columns
1922
- elif accessor != self._sample_metadata_key:
1923
- columns = self._sdata[accessor].obs.columns
1924
- else:
1925
- continue
1926
- for key in accessor_sources:
1927
- if key not in columns:
1928
- nonval_keys.append(key)
1929
- _maybe_curation_keys_not_present(nonval_keys, name)
1950
+ # check validity of keys in sources
1951
+ valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1952
+ nonval_keys = []
1953
+ for arg_key in self._sources.keys():
1954
+ if arg_key not in valid_arg_keys:
1955
+ nonval_keys.append(arg_key)
1956
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1930
1957
 
1931
- # Set up sample level metadata and table Curator objects
1958
+ # register obs columns' names
1959
+ register_columns = list(self._obs_fields.keys())
1960
+ organism = configure_organism(
1961
+ self._columns_field.field.model, self._organism
1962
+ ).get("organism")
1963
+ update_registry(
1964
+ values=register_columns,
1965
+ field=self._columns_field,
1966
+ key="columns",
1967
+ validated_only=False,
1968
+ organism=organism,
1969
+ source=self._sources.get("columns"),
1970
+ )
1971
+ additional_columns = [k for k in valid_obs_keys if k not in register_columns]
1972
+ # no need to register with validated_only=True if columns are features
1932
1973
  if (
1933
- self._sample_metadata_key is not None
1934
- and self._sample_metadata_key in self._categoricals
1974
+ len(additional_columns) > 0
1975
+ and self._columns_field.field.model is not Feature
1935
1976
  ):
1936
- self._sample_df_curator = DataFrameCatManager(
1937
- df=self._sample_metadata,
1938
- columns=Feature.name,
1939
- categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1940
- verbosity=verbosity,
1941
- sources=self._sources.get(self._sample_metadata_key),
1942
- exclude=self._exclude.get(self._sample_metadata_key),
1943
- organism=organism,
1944
- )
1945
- self._table_adata_curators = {
1946
- table: AnnDataCatManager(
1947
- data=self._sdata[table],
1948
- var_index=var_index.get(table),
1949
- categoricals=self._categoricals.get(table),
1950
- verbosity=verbosity,
1951
- sources=self._sources.get(table),
1952
- exclude=self._exclude.get(table),
1977
+ update_registry(
1978
+ values=additional_columns,
1979
+ field=self._columns_field,
1980
+ key="columns",
1981
+ validated_only=True,
1953
1982
  organism=organism,
1983
+ source=self._sources.get("columns"),
1954
1984
  )
1955
- for table in self._table_keys
1956
- }
1957
1985
 
1958
- self._non_validated = None
1986
+ def validate(self):
1987
+ """Validate categories."""
1988
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1959
1989
 
1960
- @property
1961
- def var_index(self) -> FieldAttr:
1962
- """Return the registry fields to validate variables indices against."""
1963
- return self._var_fields
1990
+ validated = True
1991
+ self._non_validated_values = {}
1992
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1993
+ for ms, (key, field) in self._var_fields.items():
1994
+ var_ms = experiment.ms[ms].var
1995
+ var_ms_key = f"{ms}__{key}"
1996
+ # it was already validated and cached
1997
+ if var_ms_key in self._validated_values:
1998
+ continue
1999
+ var_ms_values = (
2000
+ var_ms.read(column_names=[key]).concat()[key].to_pylist()
2001
+ )
2002
+ organism = configure_organism(field.field.model, self._organism).get(
2003
+ "organism"
2004
+ )
2005
+ update_registry(
2006
+ values=var_ms_values,
2007
+ field=field,
2008
+ key=var_ms_key,
2009
+ validated_only=True,
2010
+ organism=organism,
2011
+ source=self._sources.get(var_ms_key),
2012
+ )
2013
+ _, non_val = validate_categories(
2014
+ values=var_ms_values,
2015
+ field=field,
2016
+ key=var_ms_key,
2017
+ organism=organism,
2018
+ source=self._sources.get(var_ms_key),
2019
+ )
2020
+ if len(non_val) > 0:
2021
+ validated = False
2022
+ self._non_validated_values[var_ms_key] = non_val
2023
+ else:
2024
+ self._validated_values[var_ms_key] = var_ms_values
1964
2025
 
1965
- @property
1966
- def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1967
- """Return the categorical keys and fields to validate against."""
1968
- return self._categoricals
2026
+ obs = experiment.obs
2027
+ for key, field in self._obs_fields.items():
2028
+ # already validated and cached
2029
+ if key in self._validated_values:
2030
+ continue
2031
+ values = pa.compute.unique(
2032
+ obs.read(column_names=[key]).concat()[key]
2033
+ ).to_pylist()
2034
+ organism = configure_organism(field.field.model, self._organism).get(
2035
+ "organism"
2036
+ )
2037
+ update_registry(
2038
+ values=values,
2039
+ field=field,
2040
+ key=key,
2041
+ validated_only=True,
2042
+ organism=organism,
2043
+ source=self._sources.get(key),
2044
+ )
2045
+ _, non_val = validate_categories(
2046
+ values=values,
2047
+ field=field,
2048
+ key=key,
2049
+ organism=organism,
2050
+ source=self._sources.get(key),
2051
+ )
2052
+ if len(non_val) > 0:
2053
+ validated = False
2054
+ self._non_validated_values[key] = non_val
2055
+ else:
2056
+ self._validated_values[key] = values
2057
+ self._is_validated = validated
2058
+ return self._is_validated
1969
2059
 
1970
- @property
1971
- def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1972
- """Return the non-validated features and labels."""
1973
- if self._non_validated is None:
1974
- raise ValidationError("Please run validate() first!")
1975
- return self._non_validated
2060
+ def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
2061
+ assert self._non_validated_values is not None # noqa: S101
1976
2062
 
1977
- def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1978
- """Verify that the accessors exist (either a valid table or in attrs)."""
1979
- for acc in accessors:
1980
- is_present = False
1981
- try:
1982
- self._sdata.get_attrs(key=acc)
1983
- is_present = True
1984
- except KeyError:
1985
- if acc in self._sdata.tables.keys():
1986
- is_present = True
1987
- if not is_present:
1988
- raise ValidationError(f"Accessor '{acc}' does not exist!")
2063
+ if key in self._valid_obs_keys:
2064
+ field = self._obs_fields[key]
2065
+ elif key in self._valid_var_keys:
2066
+ ms = key.partition("__")[0]
2067
+ field = self._var_fields[ms][1]
2068
+ else:
2069
+ raise KeyError(f"key {key} is invalid!")
2070
+ values = self._non_validated_values.get(key, [])
2071
+ return values, field
1989
2072
 
1990
- def lookup(self, public: bool = False) -> CurateLookup:
1991
- """Look up categories.
2073
+ def add_new_from(self, key: str, **kwargs) -> None:
2074
+ """Add validated & new categories.
1992
2075
 
1993
2076
  Args:
1994
- public: Whether the lookup is performed on the public reference.
2077
+ key: The key referencing the slot in the `tiledbsoma` store.
2078
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2079
+ or a column name in `.obs`.
1995
2080
  """
1996
- cat_values_dict = list(self.categoricals.values())[0]
1997
- return CurateLookup(
1998
- categoricals=cat_values_dict,
1999
- slots={"accessors": cat_values_dict.keys()},
2000
- public=public,
2001
- )
2002
-
2003
- def _update_registry_all(self) -> None:
2004
- """Saves labels of all features for sample and table metadata."""
2005
- if self._sample_df_curator is not None:
2006
- self._sample_df_curator._update_registry_all(
2007
- validated_only=True,
2081
+ if self._non_validated_values is None:
2082
+ raise ValidationError("Run .validate() first.")
2083
+ if key == "all":
2084
+ keys = list(self._non_validated_values.keys())
2085
+ else:
2086
+ avail_keys = list(
2087
+ chain(self._non_validated_values.keys(), self._validated_values.keys())
2008
2088
  )
2009
- for _, adata_curator in self._table_adata_curators.items():
2010
- adata_curator._obs_df_curator._update_registry_all(
2011
- validated_only=True,
2089
+ if key not in avail_keys:
2090
+ raise KeyError(
2091
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2092
+ )
2093
+ keys = [key]
2094
+ for k in keys:
2095
+ values, field = self._non_validated_values_field(k)
2096
+ if len(values) == 0:
2097
+ continue
2098
+ organism = configure_organism(field.field.model, self._organism).get(
2099
+ "organism"
2100
+ )
2101
+ update_registry(
2102
+ values=values,
2103
+ field=field,
2104
+ key=k,
2105
+ validated_only=False,
2106
+ organism=organism,
2107
+ source=self._sources.get(k),
2108
+ **kwargs,
2012
2109
  )
2110
+ # update non-validated values list but keep the key there
2111
+ # it will be removed by .validate()
2112
+ if k in self._non_validated_values:
2113
+ self._non_validated_values[k] = []
2013
2114
 
2014
- def add_new_from_var_index(self, table: str, **kwargs) -> None:
2015
- """Save new values from ``.var.index`` of table.
2115
+ @property
2116
+ def non_validated(self) -> dict[str, list]:
2117
+ """Return the non-validated features and labels."""
2118
+ non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
2119
+ return non_val
2016
2120
 
2017
- Args:
2018
- table: The table key.
2019
- organism: The organism name.
2020
- **kwargs: Additional keyword arguments to pass to create new records.
2021
- """
2022
- if self._non_validated is None:
2023
- raise ValidationError("Run .validate() first.")
2024
- self._table_adata_curators[table].add_new_from_var_index(**kwargs)
2025
- if table in self.non_validated.keys():
2026
- if "var_index" in self._non_validated[table]:
2027
- self._non_validated[table].pop("var_index")
2121
+ @property
2122
+ def var_index(self) -> dict[str, FieldAttr]:
2123
+ """Return the registry fields with flattened keys to validate variables indices against."""
2124
+ return self._var_fields_flat
2028
2125
 
2029
- if len(self.non_validated[table].values()) == 0:
2030
- self.non_validated.pop(table)
2126
+ @property
2127
+ def categoricals(self) -> dict[str, FieldAttr]:
2128
+ """Return the obs fields to validate against."""
2129
+ return self._obs_fields
2031
2130
 
2032
- def add_new_from(
2033
- self,
2034
- key: str,
2035
- accessor: str | None = None,
2036
- **kwargs,
2037
- ) -> None:
2038
- """Save new values of categorical from sample level metadata or table.
2131
+ def lookup(self, public: bool = False) -> CatLookup:
2132
+ """Lookup categories.
2039
2133
 
2040
2134
  Args:
2041
- key: The key referencing the slot in the DataFrame.
2042
- accessor: The accessor key such as 'sample' or 'table x'.
2043
- organism: The organism name.
2044
- **kwargs: Additional keyword arguments to pass to create new records.
2135
+ public: If "public", the lookup is performed on the public reference.
2045
2136
  """
2046
- if self._non_validated is None:
2047
- raise ValidationError("Run .validate() first.")
2048
-
2049
- if len(kwargs) > 0 and key == "all":
2050
- raise ValueError("Cannot pass additional arguments to 'all' key!")
2051
-
2052
- if accessor not in self.categoricals:
2053
- raise ValueError(
2054
- f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2055
- )
2056
-
2057
- if accessor in self._table_adata_curators:
2058
- adata_curator = self._table_adata_curators[accessor]
2059
- adata_curator.add_new_from(key=key, **kwargs)
2060
- if accessor == self._sample_metadata_key:
2061
- self._sample_df_curator.add_new_from(key=key, **kwargs)
2062
-
2063
- if accessor in self.non_validated.keys():
2064
- if len(self.non_validated[accessor].values()) == 0:
2065
- self.non_validated.pop(accessor)
2137
+ return CatLookup(
2138
+ categoricals=self._obs_fields,
2139
+ slots={"columns": self._columns_field, **self._var_fields_flat},
2140
+ public=public,
2141
+ )
2066
2142
 
2067
- def standardize(self, key: str, accessor: str | None = None) -> None:
2068
- """Replace synonyms with canonical values.
2143
+ def standardize(self, key: str):
2144
+ """Replace synonyms with standardized values.
2069
2145
 
2070
2146
  Modifies the dataset inplace.
2071
2147
 
2072
2148
  Args:
2073
- key: The key referencing the slot in the table or sample metadata.
2074
- accessor: The accessor key such as 'sample_key' or 'table_key'.
2149
+ key: The key referencing the slot in the `tiledbsoma` store.
2150
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2151
+ or a column name in `.obs`.
2075
2152
  """
2076
2153
  if len(self.non_validated) == 0:
2077
2154
  logger.warning("values are already standardized")
2078
2155
  return
2079
- if self._artifact is not None:
2080
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2081
-
2082
- if accessor == self._sample_metadata_key:
2083
- if key not in self._sample_metadata.columns:
2084
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2156
+ avail_keys = list(self._non_validated_values.keys())
2157
+ if key == "all":
2158
+ keys = avail_keys
2085
2159
  else:
2086
- if (
2087
- key == "var_index" and self._sdata.tables[accessor].var.index is None
2088
- ) or (
2089
- key != "var_index"
2090
- and key not in self._sdata.tables[accessor].obs.columns
2091
- ):
2092
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2093
-
2094
- if accessor in self._table_adata_curators.keys():
2095
- adata_curator = self._table_adata_curators[accessor]
2096
- adata_curator.standardize(key)
2097
- if accessor == self._sample_metadata_key:
2098
- self._sample_df_curator.standardize(key)
2099
-
2100
- if len(self.non_validated[accessor].values()) == 0:
2101
- self.non_validated.pop(accessor)
2102
-
2103
- def validate(self) -> bool:
2104
- """Validate variables and categorical observations.
2105
-
2106
- This method also registers the validated records in the current instance:
2107
- - from public sources
2108
-
2109
- Args:
2110
- organism: The organism name.
2160
+ if key not in avail_keys:
2161
+ raise KeyError(
2162
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2163
+ )
2164
+ keys = [key]
2111
2165
 
2112
- Returns:
2113
- Whether the SpatialData object is validated.
2114
- """
2115
- from lamindb.core._settings import settings
2166
+ for k in keys:
2167
+ values, field = self._non_validated_values_field(k)
2168
+ if len(values) == 0:
2169
+ continue
2170
+ if k in self._valid_var_keys:
2171
+ ms, _, slot_key = k.partition("__")
2172
+ slot = lambda experiment: experiment.ms[ms].var # noqa: B023
2173
+ else:
2174
+ slot = lambda experiment: experiment.obs
2175
+ slot_key = k
2176
+ # errors if public ontology and the model has no organism
2177
+ # has to be fixed in bionty
2178
+ organism = configure_organism(field.field.model, self._organism).get(
2179
+ "organism"
2180
+ )
2181
+ syn_mapper = standardize_categories(
2182
+ values=values,
2183
+ field=field,
2184
+ source=self._sources.get(k),
2185
+ organism=organism,
2186
+ )
2187
+ if (n_syn_mapper := len(syn_mapper)) == 0:
2188
+ continue
2116
2189
 
2117
- # add all validated records to the current instance
2118
- verbosity = settings.verbosity
2119
- try:
2120
- settings.verbosity = "error"
2121
- self._update_registry_all()
2122
- finally:
2123
- settings.verbosity = verbosity
2190
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
2124
2191
 
2125
- self._non_validated = {} # type: ignore
2192
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
2193
+ value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
2194
+ table = slot(experiment).read(value_filter=value_filter).concat()
2126
2195
 
2127
- sample_validated = True
2128
- if self._sample_df_curator:
2129
- logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2130
- sample_validated &= self._sample_df_curator.validate()
2131
- if len(self._sample_df_curator.non_validated) > 0:
2132
- self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2133
- logger.print("")
2196
+ if len(table) == 0:
2197
+ continue
2134
2198
 
2135
- mods_validated = True
2136
- for table, adata_curator in self._table_adata_curators.items():
2137
- logger.info(f"validating categoricals of table '{table}' ...")
2138
- mods_validated &= adata_curator.validate()
2139
- if len(adata_curator.non_validated) > 0:
2140
- self._non_validated[table] = adata_curator.non_validated # type: ignore
2141
- logger.print("")
2199
+ df = table.to_pandas()
2200
+ # map values
2201
+ df[slot_key] = df[slot_key].map(
2202
+ lambda val: syn_mapper.get(val, val) # noqa
2203
+ )
2204
+ # write the mapped values
2205
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
2206
+ slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
2207
+ # update non_validated dict
2208
+ non_val_k = [
2209
+ nv for nv in self._non_validated_values[k] if nv not in syn_mapper
2210
+ ]
2211
+ self._non_validated_values[k] = non_val_k
2142
2212
 
2143
- self._is_validated = sample_validated & mods_validated
2144
- return self._is_validated
2213
+ syn_mapper_print = _format_values(
2214
+ [f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
2215
+ )
2216
+ s = "s" if n_syn_mapper > 1 else ""
2217
+ logger.success(
2218
+ f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
2219
+ )
2145
2220
 
2146
2221
  def save_artifact(
2147
2222
  self,
@@ -2151,217 +2226,119 @@ class SpatialDataCatManager(CatManager):
2151
2226
  revises: Artifact | None = None,
2152
2227
  run: Run | None = None,
2153
2228
  ) -> Artifact:
2229
+ """Save the validated `tiledbsoma` store and metadata.
2230
+
2231
+ Args:
2232
+ description: A description of the ``tiledbsoma`` store.
2233
+ key: A path-like key to reference artifact in default storage,
2234
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
2235
+ revises: Previous version of the artifact. Triggers a revision.
2236
+ run: The run that creates the artifact.
2237
+
2238
+ Returns:
2239
+ A saved artifact record.
2240
+ """
2154
2241
  if not self._is_validated:
2155
2242
  self.validate()
2156
2243
  if not self._is_validated:
2157
2244
  raise ValidationError("Dataset does not validate. Please curate.")
2158
2245
 
2159
- verbosity = settings.verbosity
2160
- try:
2161
- settings.verbosity = "warning"
2162
-
2163
- self._artifact = Artifact.from_spatialdata(
2164
- self._sdata,
2165
- key=key,
2246
+ if self._artifact is None:
2247
+ artifact = Artifact(
2248
+ self._dataset,
2166
2249
  description=description,
2250
+ key=key,
2167
2251
  revises=revises,
2168
2252
  run=run,
2169
2253
  )
2170
- self._artifact.save()
2254
+ artifact.n_observations = self._n_obs
2255
+ artifact.otype = "tiledbsoma"
2256
+ artifact.save()
2257
+ else:
2258
+ artifact = self._artifact
2171
2259
 
2172
- # Link schemas
2173
- feature_kwargs = check_registry_organism(
2174
- (list(self._var_fields.values())[0].field.model),
2175
- self._organism,
2260
+ feature_sets = {}
2261
+ if len(self._obs_fields) > 0:
2262
+ organism = configure_organism(
2263
+ self._columns_field.field.model, self._organism
2264
+ ).get("organism")
2265
+ empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
2266
+ mock_df = pa.Table.from_pydict(
2267
+ empty_dict, schema=self._obs_pa_schema
2268
+ ).to_pandas()
2269
+ # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
2270
+ feature_sets["obs"] = Schema.from_df(
2271
+ df=mock_df,
2272
+ field=self._columns_field,
2273
+ mute=True,
2274
+ organism=organism,
2176
2275
  )
2177
-
2178
- def _add_set_from_spatialdata(
2179
- host: Artifact | Collection | Run,
2180
- var_fields: dict[str, FieldAttr],
2181
- obs_fields: dict[str, FieldAttr] = None,
2182
- mute: bool = False,
2183
- organism: str | Record | None = None,
2184
- ):
2185
- """Add Schemas from SpatialData."""
2186
- if obs_fields is None:
2187
- obs_fields = {}
2188
- assert host.otype == "SpatialData" # noqa: S101
2189
-
2190
- feature_sets = {}
2191
-
2192
- # sample features
2193
- sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2194
- if len(sample_features) > 0:
2195
- feature_sets[self._sample_metadata_key] = Schema(
2196
- features=sample_features
2197
- )
2198
-
2199
- # table features
2200
- for table, field in var_fields.items():
2201
- table_fs = parse_staged_feature_sets_from_anndata(
2202
- self._sdata[table],
2203
- var_field=field,
2204
- obs_field=obs_fields.get(table, Feature.name),
2205
- mute=mute,
2206
- organism=organism,
2207
- )
2208
- for k, v in table_fs.items():
2209
- feature_sets[f"['{table}'].{k}"] = v
2210
-
2211
- def _unify_staged_feature_sets_by_hash(
2212
- feature_sets: MutableMapping[str, Schema],
2213
- ):
2214
- unique_values: dict[str, Any] = {}
2215
-
2216
- for key, value in feature_sets.items():
2217
- value_hash = (
2218
- value.hash
2219
- ) # Assuming each value has a .hash attribute
2220
- if value_hash in unique_values:
2221
- feature_sets[key] = unique_values[value_hash]
2222
- else:
2223
- unique_values[value_hash] = value
2224
-
2225
- return feature_sets
2226
-
2227
- # link feature sets
2228
- host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2229
- feature_sets
2230
- )
2231
- host.save()
2232
-
2233
- _add_set_from_spatialdata(
2234
- self._artifact, var_fields=self._var_fields, **feature_kwargs
2276
+ for ms in self._var_fields:
2277
+ var_key, var_field = self._var_fields[ms]
2278
+ organism = configure_organism(var_field.field.model, self._organism).get(
2279
+ "organism"
2235
2280
  )
2281
+ feature_sets[f"{ms}__var"] = Schema.from_values(
2282
+ values=self._validated_values[f"{ms}__{var_key}"],
2283
+ field=var_field,
2284
+ organism=organism,
2285
+ raise_validation_error=False,
2286
+ )
2287
+ artifact._staged_feature_sets = feature_sets
2236
2288
 
2237
- # Link labels
2238
- def _add_labels_from_spatialdata(
2239
- data,
2240
- artifact: Artifact,
2241
- fields: dict[str, FieldAttr],
2242
- feature_ref_is_name: bool | None = None,
2243
- ):
2244
- """Add Labels from SpatialData."""
2245
- features = Feature.lookup().dict()
2246
- for key, field in fields.items():
2247
- feature = features.get(key)
2248
- registry = field.field.model
2249
- filter_kwargs = check_registry_organism(registry, self._organism)
2250
- filter_kwargs_current = get_current_filter_kwargs(
2251
- registry, filter_kwargs
2252
- )
2253
- df = data if isinstance(data, pd.DataFrame) else data.obs
2254
- labels = registry.from_values(
2255
- df[key],
2256
- field=field,
2257
- **filter_kwargs_current,
2258
- )
2259
- if len(labels) == 0:
2260
- continue
2261
-
2262
- label_ref_is_name = None
2263
- if hasattr(registry, "_name_field"):
2264
- label_ref_is_name = field.field.name == registry._name_field
2265
- add_labels(
2266
- artifact,
2267
- records=labels,
2268
- feature=feature,
2269
- feature_ref_is_name=feature_ref_is_name,
2270
- label_ref_is_name=label_ref_is_name,
2271
- from_curator=True,
2272
- )
2273
-
2274
- for accessor, accessor_fields in self._categoricals.items():
2275
- column_field = self._var_fields.get(accessor)
2276
- if accessor == self._sample_metadata_key:
2277
- _add_labels_from_spatialdata(
2278
- self._sample_metadata,
2279
- self._artifact,
2280
- accessor_fields,
2281
- feature_ref_is_name=(
2282
- None if column_field is None else _ref_is_name(column_field)
2283
- ),
2284
- )
2285
- else:
2286
- _add_labels_from_spatialdata(
2287
- self._sdata.tables[accessor],
2288
- self._artifact,
2289
- accessor_fields,
2290
- feature_ref_is_name=(
2291
- None if column_field is None else _ref_is_name(column_field)
2292
- ),
2293
- )
2294
-
2295
- finally:
2296
- settings.verbosity = verbosity
2297
-
2298
- slug = ln_setup.settings.instance.slug
2299
- if ln_setup.settings.instance.is_remote: # pragma: no cover
2300
- logger.important(
2301
- f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2289
+ feature_ref_is_name = _ref_is_name(self._columns_field)
2290
+ features = Feature.lookup().dict()
2291
+ for key, field in self._obs_fields.items():
2292
+ feature = features.get(key)
2293
+ registry = field.field.model
2294
+ organism = configure_organism(field.field.model, self._organism).get(
2295
+ "organism"
2296
+ )
2297
+ labels = registry.from_values(
2298
+ values=self._validated_values[key], field=field, organism=organism
2302
2299
  )
2300
+ if len(labels) == 0:
2301
+ continue
2302
+ if hasattr(registry, "_name_field"):
2303
+ label_ref_is_name = field.field.name == registry._name_field
2304
+ add_labels(
2305
+ artifact,
2306
+ records=labels,
2307
+ feature=feature,
2308
+ feature_ref_is_name=feature_ref_is_name,
2309
+ label_ref_is_name=label_ref_is_name,
2310
+ from_curator=True,
2311
+ )
2303
2312
 
2304
- return self._artifact
2313
+ return artifact.save()
2305
2314
 
2306
2315
 
2307
- def _restrict_obs_fields(
2308
- obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2309
- ) -> dict[str, str]:
2310
- """Restrict the obs fields to name return only available obs fields.
2316
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2317
+ """Categorical manager for `AnnData` respecting the CELLxGENE schema.
2311
2318
 
2312
- To simplify the curation, we only validate against either name or ontology_id.
2313
- If both are available, we validate against ontology_id.
2314
- If none are available, we validate against name.
2319
+ This will be superceded by a schema-based curation flow.
2315
2320
  """
2316
- obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2317
- for name, field in obs_fields.items():
2318
- if name.endswith("_ontology_term_id"):
2319
- continue
2320
- # if both the ontology id and the name are present, only validate on the ontology_id
2321
- if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2322
- obs_fields_unique.pop(name)
2323
- # if the neither name nor ontology id are present, validate on the name
2324
- # this will raise error downstream, we just use name to be more readable
2325
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2326
- obs_fields_unique[name] = field
2327
-
2328
- # Only retain obs_fields_unique that have keys in adata.obs.columns
2329
- available_obs_fields = {
2330
- k: v for k, v in obs_fields_unique.items() if k in obs.columns
2331
- }
2332
-
2333
- return available_obs_fields
2334
-
2335
2321
 
2336
- def _add_defaults_to_obs(
2337
- obs: pd.DataFrame,
2338
- defaults: dict[str, str],
2339
- ) -> None:
2340
- """Add default columns and values to obs DataFrame."""
2341
- added_defaults: dict = {}
2342
- for name, default in defaults.items():
2343
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2344
- obs[name] = default
2345
- added_defaults[name] = default
2346
- logger.important(
2347
- f"added default value '{default}' to the adata.obs['{name}']"
2348
- )
2349
-
2350
-
2351
- class CellxGeneAnnDataCatManager(AnnDataCatManager):
2352
- """Annotation flow of AnnData based on CELLxGENE schema."""
2353
-
2354
- _controls_were_created: bool | None = None
2322
+ cxg_categoricals_defaults = {
2323
+ "cell_type": "unknown",
2324
+ "development_stage": "unknown",
2325
+ "disease": "normal",
2326
+ "donor_id": "unknown",
2327
+ "self_reported_ethnicity": "unknown",
2328
+ "sex": "unknown",
2329
+ "suspension_type": "cell",
2330
+ "tissue_type": "tissue",
2331
+ }
2355
2332
 
2356
2333
  def __init__(
2357
2334
  self,
2358
- adata: ad.AnnData | UPathStr,
2335
+ adata: ad.AnnData,
2359
2336
  categoricals: dict[str, FieldAttr] | None = None,
2360
2337
  organism: Literal["human", "mouse"] = "human",
2361
2338
  *,
2339
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2362
2340
  defaults: dict[str, str] = None,
2363
2341
  extra_sources: dict[str, Record] = None,
2364
- schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2365
2342
  verbosity: str = "hint",
2366
2343
  ) -> None:
2367
2344
  """CELLxGENE schema curator.
@@ -2371,304 +2348,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2371
2348
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2372
2349
  The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2373
2350
  organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2351
+ schema_version: The CELLxGENE schema version to curate against.
2374
2352
  defaults: Default values that are set if columns or column values are missing.
2375
2353
  extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2376
2354
  These extra sources are joined with the CELLxGENE fixed sources.
2377
2355
  Use this parameter when subclassing.
2378
- exclude: A dictionary mapping column names to values to exclude.
2379
- schema_version: The CELLxGENE schema version to curate against.
2380
2356
  verbosity: The verbosity level.
2381
-
2382
2357
  """
2383
2358
  import bionty as bt
2384
2359
 
2385
- CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2360
+ from ._cellxgene_schemas import (
2361
+ _add_defaults_to_obs,
2362
+ _create_sources,
2363
+ _init_categoricals_additional_values,
2364
+ _restrict_obs_fields,
2365
+ )
2386
2366
 
2387
- var_index: FieldAttr = bt.Gene.ensembl_gene_id
2367
+ # Add defaults first to ensure that we fetch valid sources
2368
+ if defaults:
2369
+ _add_defaults_to_obs(adata.obs, defaults)
2388
2370
 
2371
+ # Filter categoricals based on what's present in adata
2389
2372
  if categoricals is None:
2390
- categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2373
+ categoricals = self._get_cxg_categoricals()
2374
+ categoricals = _restrict_obs_fields(adata.obs, categoricals)
2391
2375
 
2392
- self.organism = organism
2393
-
2394
- VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2395
- if schema_version not in VALID_SCHEMA_VERSIONS:
2396
- valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2397
- raise ValueError(
2398
- f"Invalid schema_version: {schema_version}. "
2399
- f"Valid versions are: {valid_versions}"
2400
- )
2376
+ # Configure sources
2377
+ sources = _create_sources(categoricals, schema_version, organism)
2401
2378
  self.schema_version = schema_version
2402
2379
  self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2403
- with resources.path(
2404
- "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2405
- ) as schema_versions_path:
2406
- self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2407
- self.schema_version
2408
- ]
2409
-
2410
- # Fetch AnnData obs to be able to set defaults and get sources
2411
- if isinstance(adata, ad.AnnData):
2412
- self._adata_obs = adata.obs
2413
- else:
2414
- self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2415
-
2416
- # Add defaults first to ensure that we fetch valid sources
2417
- if defaults:
2418
- _add_defaults_to_obs(self._adata_obs, defaults)
2419
-
2420
- self.sources = self._create_sources(self._adata_obs)
2421
- self.sources = {
2422
- entity: source
2423
- for entity, source in self.sources.items()
2424
- if source is not None
2425
- }
2426
-
2427
2380
  # These sources are not a part of the cellxgene schema but rather passed through.
2428
2381
  # This is useful when other Curators extend the CELLxGENE curator
2429
2382
  if extra_sources:
2430
- self.sources = self.sources | extra_sources
2383
+ sources = sources | extra_sources
2431
2384
 
2432
- # Exclude default values from validation because they are not available in the pinned sources
2433
- exclude_keys = {
2434
- entity: default
2435
- for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2436
- if entity in self._adata_obs.columns # type: ignore
2437
- }
2385
+ _init_categoricals_additional_values()
2438
2386
 
2439
2387
  super().__init__(
2440
2388
  data=adata,
2441
- var_index=var_index,
2442
- categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2389
+ var_index=bt.Gene.ensembl_gene_id,
2390
+ categoricals=categoricals,
2443
2391
  verbosity=verbosity,
2444
2392
  organism=organism,
2445
- sources=self.sources,
2446
- exclude=exclude_keys,
2393
+ sources=sources,
2447
2394
  )
2448
2395
 
2449
2396
  @classmethod
2450
- def _init_categoricals_additional_values(cls) -> None:
2451
- import bionty as bt
2452
-
2453
- import lamindb as ln
2454
-
2455
- # Note: if you add another control below, be mindful to change the if condition that
2456
- # triggers whether creating these records is re-considered
2457
- if cls._controls_were_created is None:
2458
- cls._controls_were_created = (
2459
- ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2460
- is not None
2461
- )
2462
- if not cls._controls_were_created:
2463
- logger.important("Creating control labels in the CellxGene schema.")
2464
- bt.CellType(
2465
- ontology_id="unknown",
2466
- name="unknown",
2467
- description="From CellxGene schema.",
2468
- ).save()
2469
- pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2470
- normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2471
- bt.Disease(
2472
- uid=normal.uid,
2473
- name=normal.name,
2474
- ontology_id=normal.ontology_id,
2475
- description=normal.description,
2476
- source=normal.source,
2477
- ).save()
2478
- bt.Ethnicity(
2479
- ontology_id="na", name="na", description="From CellxGene schema."
2480
- ).save()
2481
- bt.Ethnicity(
2482
- ontology_id="unknown",
2483
- name="unknown",
2484
- description="From CellxGene schema.",
2485
- ).save()
2486
- bt.DevelopmentalStage(
2487
- ontology_id="unknown",
2488
- name="unknown",
2489
- description="From CellxGene schema.",
2490
- ).save()
2491
- bt.Phenotype(
2492
- ontology_id="unknown",
2493
- name="unknown",
2494
- description="From CellxGene schema.",
2495
- ).save()
2496
-
2497
- tissue_type = ln.ULabel(
2498
- name="TissueType",
2499
- is_type=True,
2500
- description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2501
- ).save()
2502
- ln.ULabel(
2503
- name="tissue", type=tissue_type, description="From CellxGene schema."
2504
- ).save()
2505
- ln.ULabel(
2506
- name="organoid", type=tissue_type, description="From CellxGene schema."
2507
- ).save()
2508
- ln.ULabel(
2509
- name="cell culture",
2510
- type=tissue_type,
2511
- description="From CellxGene schema.",
2512
- ).save()
2513
-
2514
- suspension_type = ln.ULabel(
2515
- name="SuspensionType",
2516
- is_type=True,
2517
- description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2518
- ).save()
2519
- ln.ULabel(
2520
- name="cell", type=suspension_type, description="From CellxGene schema."
2521
- ).save()
2522
- ln.ULabel(
2523
- name="nucleus",
2524
- type=suspension_type,
2525
- description="From CellxGene schema.",
2526
- ).save()
2527
- ln.ULabel(name="na", type=suspension_type).save()
2528
-
2529
- @classmethod
2530
- def _get_categoricals(cls) -> dict[str, FieldAttr]:
2531
- import bionty as bt
2532
-
2533
- return {
2534
- "assay": bt.ExperimentalFactor.name,
2535
- "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2536
- "cell_type": bt.CellType.name,
2537
- "cell_type_ontology_term_id": bt.CellType.ontology_id,
2538
- "development_stage": bt.DevelopmentalStage.name,
2539
- "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2540
- "disease": bt.Disease.name,
2541
- "disease_ontology_term_id": bt.Disease.ontology_id,
2542
- # "donor_id": "str", via pandera
2543
- "self_reported_ethnicity": bt.Ethnicity.name,
2544
- "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2545
- "sex": bt.Phenotype.name,
2546
- "sex_ontology_term_id": bt.Phenotype.ontology_id,
2547
- "suspension_type": ULabel.name,
2548
- "tissue": bt.Tissue.name,
2549
- "tissue_ontology_term_id": bt.Tissue.ontology_id,
2550
- "tissue_type": ULabel.name,
2551
- "organism": bt.Organism.name,
2552
- "organism_ontology_term_id": bt.Organism.ontology_id,
2553
- }
2554
-
2555
- @classmethod
2397
+ @deprecated(new_name="cxg_categoricals_defaults")
2556
2398
  def _get_categoricals_defaults(cls) -> dict[str, str]:
2557
- return {
2558
- "cell_type": "unknown",
2559
- "development_stage": "unknown",
2560
- "disease": "normal",
2561
- "donor_id": "unknown",
2562
- "self_reported_ethnicity": "unknown",
2563
- "sex": "unknown",
2564
- "suspension_type": "cell",
2565
- "tissue_type": "tissue",
2566
- }
2567
-
2568
- @property
2569
- def pinned_ontologies(self) -> pd.DataFrame:
2570
- return self._pinned_ontologies
2571
-
2572
- @property
2573
- def adata(self) -> AnnData:
2574
- return self._adata
2575
-
2576
- def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2577
- """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2578
- import bionty as bt
2579
-
2580
- # fmt: off
2581
- def _fetch_bionty_source(
2582
- entity: str, organism: str, source: str
2583
- ) -> bt.Source | None:
2584
- """Fetch the Bionty source of the pinned ontology.
2399
+ return cls.cxg_categoricals_defaults
2585
2400
 
2586
- Returns None if the source does not exist.
2587
- """
2588
- version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2589
- (self._pinned_ontologies["organism"] == organism) &
2590
- (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2591
- return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2592
-
2593
- entity_mapping = {
2594
- "var_index": ("Gene", self.organism, "ensembl"),
2595
- "cell_type": ("CellType", "all", "cl"),
2596
- "assay": ("ExperimentalFactor", "all", "efo"),
2597
- "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2598
- "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2599
- "disease": ("Disease", "all", "mondo"),
2600
- # "organism": ("Organism", "vertebrates", "ensembl"),
2601
- "sex": ("Phenotype", "all", "pato"),
2602
- "tissue": ("Tissue", "all", "uberon"),
2603
- }
2604
- # fmt: on
2605
-
2606
- # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2607
- entity_to_sources = {
2608
- entity: _fetch_bionty_source(*params)
2609
- for entity, params in entity_mapping.items()
2610
- if entity in obs.columns
2611
- or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2612
- or entity == "var_index"
2613
- }
2614
-
2615
- return entity_to_sources
2616
-
2617
- def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2618
- """Converts a column that stores a name into a column that stores the ontology id.
2619
-
2620
- cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2621
- """
2622
- field_name = field.field.name
2623
- assert field_name == "name" # noqa: S101
2624
- cols = ["name", "ontology_id"]
2625
- registry = field.field.model
2401
+ @classmethod
2402
+ def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
2403
+ """Returns the CELLxGENE schema mapped fields."""
2404
+ from ._cellxgene_schemas import _get_cxg_categoricals
2626
2405
 
2627
- if hasattr(registry, "ontology_id"):
2628
- validated_records = registry.filter(**{f"{field_name}__in": values})
2629
- mapper = (
2630
- pd.DataFrame(validated_records.values_list(*cols))
2631
- .set_index(0)
2632
- .to_dict()[1]
2633
- )
2634
- return values.map(mapper)
2406
+ return _get_cxg_categoricals()
2635
2407
 
2636
- def validate(self) -> bool: # type: ignore
2408
+ def validate(self) -> bool:
2637
2409
  """Validates the AnnData object against most cellxgene requirements."""
2410
+ from ._cellxgene_schemas import RESERVED_NAMES
2411
+
2638
2412
  # Verify that all required obs columns are present
2413
+ required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
2639
2414
  missing_obs_fields = [
2640
2415
  name
2641
- for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2416
+ for name in required_columns
2642
2417
  if name not in self._adata.obs.columns
2643
2418
  and f"{name}_ontology_term_id" not in self._adata.obs.columns
2644
2419
  ]
2645
2420
  if len(missing_obs_fields) > 0:
2646
- missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2647
- logger.error(f"missing required obs columns {missing_obs_fields_str}")
2648
- logger.info(
2649
- "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2650
- "to automatically add these columns with default values."
2421
+ logger.error(
2422
+ f"missing required obs columns {_format_values(missing_obs_fields)}\n"
2423
+ " → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
2651
2424
  )
2652
2425
  return False
2653
2426
 
2654
2427
  # Verify that no cellxgene reserved names are present
2655
- reserved_names = {
2656
- "ethnicity",
2657
- "ethnicity_ontology_term_id",
2658
- "X_normalization",
2659
- "default_field",
2660
- "layer_descriptions",
2661
- "tags",
2662
- "versions",
2663
- "contributors",
2664
- "preprint_doi",
2665
- "project_description",
2666
- "project_links",
2667
- "project_name",
2668
- "publication_doi",
2669
- }
2670
2428
  matched_columns = [
2671
- column for column in self._adata.obs.columns if column in reserved_names
2429
+ column for column in self._adata.obs.columns if column in RESERVED_NAMES
2672
2430
  ]
2673
2431
  if len(matched_columns) > 0:
2674
2432
  raise ValueError(
@@ -2695,6 +2453,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2695
2453
  Returns:
2696
2454
  An AnnData object which adheres to the cellxgene-schema.
2697
2455
  """
2456
+
2457
+ def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
2458
+ """Converts a column that stores a name into a column that stores the ontology id.
2459
+
2460
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2461
+ """
2462
+ field_name = field.field.name
2463
+ assert field_name == "name" # noqa: S101
2464
+ cols = ["name", "ontology_id"]
2465
+ registry = field.field.model
2466
+
2467
+ if hasattr(registry, "ontology_id"):
2468
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2469
+ mapper = (
2470
+ pd.DataFrame(validated_records.values_list(*cols))
2471
+ .set_index(0)
2472
+ .to_dict()[1]
2473
+ )
2474
+ return values.map(mapper)
2475
+
2698
2476
  # Create a copy since we modify the AnnData object extensively
2699
2477
  adata_cxg = self._adata.copy()
2700
2478
 
@@ -2714,7 +2492,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2714
2492
  # convert name column to ontology_term_id column
2715
2493
  for column in adata_cxg.obs.columns:
2716
2494
  if column in self.categoricals and not column.endswith("_ontology_term_id"):
2717
- mapped_column = self._convert_name_to_ontology_id(
2495
+ mapped_column = _convert_name_to_ontology_id(
2718
2496
  adata_cxg.obs[column], field=self.categoricals.get(column)
2719
2497
  )
2720
2498
  if mapped_column is not None:
@@ -2880,7 +2658,7 @@ class TimeHandler:
2880
2658
 
2881
2659
 
2882
2660
  class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2883
- """Curator flow for Perturbation data."""
2661
+ """Categorical manager for `AnnData` to manage perturbations."""
2884
2662
 
2885
2663
  PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2886
2664
 
@@ -2891,45 +2669,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2891
2669
  pert_dose: bool = True,
2892
2670
  pert_time: bool = True,
2893
2671
  *,
2672
+ cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2894
2673
  verbosity: str = "hint",
2895
- cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2896
2674
  ):
2897
2675
  """Initialize the curator with configuration and validation settings."""
2898
- import bionty as bt
2899
-
2900
2676
  self._pert_time = pert_time
2901
2677
  self._pert_dose = pert_dose
2902
2678
 
2903
2679
  self._validate_initial_data(adata)
2904
- self._setup_configuration(adata)
2905
-
2906
- self._setup_sources(adata)
2907
- self._setup_compound_source()
2680
+ categoricals, categoricals_defaults = self._configure_categoricals(adata)
2908
2681
 
2909
2682
  super().__init__(
2910
2683
  adata=adata,
2911
- categoricals=self.PT_CATEGORICALS,
2912
- defaults=self.PT_DEFAULT_VALUES,
2913
- verbosity=verbosity,
2684
+ categoricals=categoricals,
2685
+ defaults=categoricals_defaults,
2914
2686
  organism=organism,
2915
- extra_sources=self.PT_SOURCES,
2687
+ extra_sources=self._configure_sources(adata),
2916
2688
  schema_version=cxg_schema_version,
2689
+ verbosity=verbosity,
2917
2690
  )
2918
2691
 
2919
- def _setup_configuration(self, adata: ad.AnnData):
2692
+ def _configure_categoricals(self, adata: ad.AnnData):
2920
2693
  """Set up default configuration values."""
2921
2694
  import bionty as bt
2922
2695
  import wetlab as wl
2923
2696
 
2924
- self.PT_DEFAULT_VALUES = (
2925
- CellxGeneAnnDataCatManager._get_categoricals_defaults()
2926
- | {
2927
- "cell_line": "unknown",
2928
- "pert_target": "unknown",
2929
- }
2930
- )
2931
-
2932
- self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2697
+ categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
2933
2698
  k: v
2934
2699
  for k, v in {
2935
2700
  "cell_line": bt.CellLine.name,
@@ -2941,22 +2706,40 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2941
2706
  }.items()
2942
2707
  if k in adata.obs.columns
2943
2708
  }
2944
- # if "donor_id" in self.PT_CATEGORICALS:
2945
- # self.PT_CATEGORICALS["donor_id"] = Donor.name
2709
+ # if "donor_id" in categoricals:
2710
+ # categoricals["donor_id"] = Donor.name
2946
2711
 
2947
- def _setup_sources(self, adata: ad.AnnData):
2712
+ categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
2713
+ "cell_line": "unknown",
2714
+ "pert_target": "unknown",
2715
+ }
2716
+
2717
+ return categoricals, categoricals_defaults
2718
+
2719
+ def _configure_sources(self, adata: ad.AnnData):
2948
2720
  """Set up data sources."""
2949
- self.PT_SOURCES = {}
2950
- # if "cell_line" in adata.obs.columns:
2951
- # self.PT_SOURCES["cell_line"] = (
2952
- # bt.Source.filter(name="depmap").first()
2953
- # )
2721
+ import bionty as bt
2722
+ import wetlab as wl
2723
+
2724
+ sources = {}
2725
+ if "cell_line" in adata.obs.columns:
2726
+ sources["cell_line"] = bt.Source.filter(
2727
+ entity="bionty.CellLine", name="depmap"
2728
+ ).first()
2954
2729
  if "pert_compound" in adata.obs.columns:
2955
- import bionty as bt
2730
+ with logger.mute():
2731
+ chebi_source = bt.Source.filter(
2732
+ entity="wetlab.Compound", name="chebi"
2733
+ ).first()
2734
+ if not chebi_source:
2735
+ wl.Compound.add_source(
2736
+ bt.Source.filter(entity="Drug", name="chebi").first()
2737
+ )
2956
2738
 
2957
- self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2739
+ sources["pert_compound"] = bt.Source.filter(
2958
2740
  entity="wetlab.Compound", name="chebi"
2959
2741
  ).first()
2742
+ return sources
2960
2743
 
2961
2744
  def _validate_initial_data(self, adata: ad.AnnData):
2962
2745
  """Validate the initial data structure."""
@@ -3004,20 +2787,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3004
2787
  adata.obs[col_name].cat.remove_unused_categories()
3005
2788
  logger.important(f"mapped 'pert_name' to '{col_name}'")
3006
2789
 
3007
- def _setup_compound_source(self):
3008
- """Set up the compound source with muted logging."""
3009
- import bionty as bt
3010
- import wetlab as wl
3011
-
3012
- with logger.mute():
3013
- chebi_source = bt.Source.filter(
3014
- entity="wetlab.Compound", name="chebi"
3015
- ).first()
3016
- if not chebi_source:
3017
- wl.Compound.add_source(
3018
- bt.Source.filter(entity="Drug", name="chebi").first()
3019
- )
3020
-
3021
2790
  def validate(self) -> bool: # type: ignore
3022
2791
  """Validate the AnnData object."""
3023
2792
  validated = super().validate()
@@ -3135,8 +2904,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3135
2904
 
3136
2905
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
3137
2906
  """Make sure the source and organism are saved in the same database as the registry."""
3138
- from lamindb.core._settings import settings
3139
-
3140
2907
  db = registry.filter().db
3141
2908
  source = kwargs.get("source")
3142
2909
  organism = kwargs.get("organism")
@@ -3161,44 +2928,15 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
3161
2928
  return filter_kwargs
3162
2929
 
3163
2930
 
3164
- def inspect_instance(
3165
- values: Iterable[str],
3166
- field: FieldAttr,
3167
- registry: type[Record],
3168
- exclude: str | list | None = None,
3169
- **kwargs,
3170
- ):
3171
- """Inspect values using a registry."""
3172
- # inspect exclude values in the default instance
3173
- values = list(values)
3174
- include_validated = []
3175
- if exclude is not None:
3176
- exclude = [exclude] if isinstance(exclude, str) else exclude
3177
- exclude = [i for i in exclude if i in values]
3178
- if len(exclude) > 0:
3179
- # exclude values are validated without source and organism
3180
- inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
3181
- # if exclude values are validated, remove them from the values
3182
- values = [i for i in values if i not in inspect_result_exclude.validated]
3183
- include_validated = inspect_result_exclude.validated
3184
-
3185
- inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
3186
- inspect_result._validated += include_validated
3187
- inspect_result._non_validated = [
3188
- i for i in inspect_result.non_validated if i not in include_validated
3189
- ]
3190
-
3191
- return inspect_result
3192
-
3193
-
3194
- def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
2931
+ def configure_organism(registry: Record, organism: str | None = None) -> dict[str, str]:
3195
2932
  """Check if a registry needs an organism and return the organism name."""
3196
- if hasattr(registry, "organism_id"):
2933
+ from ..models._from_values import _is_organism_required
2934
+
2935
+ if _is_organism_required(registry):
3197
2936
  import bionty as bt
3198
2937
 
3199
- if organism is None and bt.settings.organism is None:
3200
- return {}
3201
- return {"organism": organism or bt.settings.organism.name}
2938
+ if organism is not None or bt.settings.organism is not None:
2939
+ return {"organism": organism or bt.settings.organism.name}
3202
2940
  return {}
3203
2941
 
3204
2942
 
@@ -3208,7 +2946,6 @@ def validate_categories(
3208
2946
  key: str,
3209
2947
  organism: str | None = None,
3210
2948
  source: Record | None = None,
3211
- exclude: str | list | None = None,
3212
2949
  hint_print: str | None = None,
3213
2950
  curator: CatManager | None = None,
3214
2951
  ) -> tuple[bool, list[str]]:
@@ -3220,13 +2957,9 @@ def validate_categories(
3220
2957
  key: The key referencing the slot in the DataFrame.
3221
2958
  organism: The organism name.
3222
2959
  source: The source record.
3223
- exclude: Exclude specific values from validation.
3224
2960
  standardize: Whether to standardize the values.
3225
2961
  hint_print: The hint to print that suggests fixing non-validated values.
3226
2962
  """
3227
- from lamindb.core._settings import settings
3228
- from lamindb.models._from_values import _format_values
3229
-
3230
2963
  model_field = f"{field.field.model.__name__}.{field.field.name}"
3231
2964
 
3232
2965
  def _log_mapping_info():
@@ -3236,36 +2969,26 @@ def validate_categories(
3236
2969
 
3237
2970
  registry = field.field.model
3238
2971
 
3239
- # {"organism": organism_name/organism_record}
3240
- kwargs = check_registry_organism(registry, organism)
2972
+ # {"organism": organism_name}
2973
+ kwargs = configure_organism(registry, organism)
3241
2974
  kwargs.update({"source": source} if source else {})
3242
2975
  kwargs_current = get_current_filter_kwargs(registry, kwargs)
3243
2976
 
3244
2977
  # inspect values from the default instance
3245
- inspect_result = inspect_instance(
3246
- values=values,
3247
- field=field,
3248
- registry=registry,
3249
- exclude=exclude,
3250
- **kwargs_current,
3251
- )
2978
+ inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
3252
2979
  non_validated = inspect_result.non_validated
3253
2980
  syn_mapper = inspect_result.synonyms_mapper
3254
2981
 
3255
2982
  # inspect the non-validated values from public (bionty only)
3256
2983
  values_validated = []
3257
2984
  if hasattr(registry, "public"):
3258
- verbosity = settings.verbosity
3259
- try:
3260
- settings.verbosity = "error"
3261
- public_records = registry.from_values(
3262
- non_validated,
3263
- field=field,
3264
- **kwargs_current,
3265
- )
3266
- values_validated += [getattr(r, field.field.name) for r in public_records]
3267
- finally:
3268
- settings.verbosity = verbosity
2985
+ public_records = registry.from_values(
2986
+ non_validated,
2987
+ field=field,
2988
+ mute=True,
2989
+ **kwargs_current,
2990
+ )
2991
+ values_validated += [getattr(r, field.field.name) for r in public_records]
3269
2992
 
3270
2993
  # logging messages
3271
2994
  non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
@@ -3329,7 +3052,6 @@ def validate_categories_in_df(
3329
3052
  df: pd.DataFrame,
3330
3053
  fields: dict[str, FieldAttr],
3331
3054
  sources: dict[str, Record] = None,
3332
- exclude: dict | None = None,
3333
3055
  curator: CatManager | None = None,
3334
3056
  **kwargs,
3335
3057
  ) -> tuple[bool, dict]:
@@ -3347,7 +3069,6 @@ def validate_categories_in_df(
3347
3069
  field=field,
3348
3070
  key=key,
3349
3071
  source=sources.get(key),
3350
- exclude=exclude.get(key) if exclude else None,
3351
3072
  curator=curator,
3352
3073
  **kwargs,
3353
3074
  )
@@ -3358,9 +3079,10 @@ def validate_categories_in_df(
3358
3079
 
3359
3080
 
3360
3081
  def save_artifact(
3361
- data: pd.DataFrame | ad.AnnData | MuData,
3082
+ data: pd.DataFrame | ScverseDataStructures,
3083
+ *,
3362
3084
  fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
3363
- columns_field: FieldAttr | dict[str, FieldAttr] | None = None,
3085
+ index_field: FieldAttr | dict[str, FieldAttr] | None = None,
3364
3086
  description: str | None = None,
3365
3087
  organism: str | None = None,
3366
3088
  key: str | None = None,
@@ -3368,73 +3090,64 @@ def save_artifact(
3368
3090
  revises: Artifact | None = None,
3369
3091
  run: Run | None = None,
3370
3092
  schema: Schema | None = None,
3093
+ **kwargs,
3371
3094
  ) -> Artifact:
3372
3095
  """Save all metadata with an Artifact.
3373
3096
 
3374
3097
  Args:
3375
- data: The DataFrame/AnnData/MuData object to save.
3098
+ data: The object to save.
3376
3099
  fields: A dictionary mapping obs_column to registry_field.
3377
- columns_field: The registry field to validate variables index against.
3100
+ index_field: The registry field to validate variables index against.
3378
3101
  description: A description of the artifact.
3379
3102
  organism: The organism name.
3380
- type: The artifact type.
3381
3103
  key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3382
3104
  artifact: A already registered artifact. Passing this will not save a new artifact from data.
3383
3105
  revises: Previous version of the artifact. Triggers a revision.
3384
3106
  run: The run that creates the artifact.
3107
+ schema: The Schema to associate with the Artifact.
3385
3108
 
3386
3109
  Returns:
3387
3110
  The saved Artifact.
3388
3111
  """
3389
- from ..models.artifact import add_labels, data_is_anndata, data_is_mudata
3112
+ from ..models.artifact import add_labels
3390
3113
 
3391
3114
  if artifact is None:
3392
- if data_is_anndata(data):
3393
- artifact = Artifact.from_anndata(
3115
+ if isinstance(data, pd.DataFrame):
3116
+ artifact = Artifact.from_df(
3394
3117
  data, description=description, key=key, revises=revises, run=run
3395
3118
  )
3396
- elif isinstance(data, pd.DataFrame):
3397
- artifact = Artifact.from_df(
3119
+ elif isinstance(data, AnnData):
3120
+ artifact = Artifact.from_anndata(
3398
3121
  data, description=description, key=key, revises=revises, run=run
3399
3122
  )
3400
3123
  elif data_is_mudata(data):
3401
3124
  artifact = Artifact.from_mudata(
3402
- data,
3403
- description=description,
3404
- key=key,
3405
- revises=revises,
3406
- run=run,
3125
+ data, description=description, key=key, revises=revises, run=run
3126
+ )
3127
+ elif data_is_spatialdata(data):
3128
+ artifact = Artifact.from_spatialdata(
3129
+ data, description=description, key=key, revises=revises, run=run
3130
+ )
3131
+ else:
3132
+ raise InvalidArgument( # pragma: no cover
3133
+ "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
3407
3134
  )
3408
- artifact.schema = schema
3409
3135
  artifact.save()
3410
3136
 
3411
- if organism is not None and columns_field is not None:
3412
- feature_kwargs = check_registry_organism(
3137
+ if organism is not None and index_field is not None:
3138
+ feature_kwargs = configure_organism(
3413
3139
  (
3414
- list(columns_field.values())[0].field.model
3415
- if isinstance(columns_field, dict)
3416
- else columns_field.field.model
3140
+ list(index_field.values())[0].field.model
3141
+ if isinstance(index_field, dict)
3142
+ else index_field.field.model
3417
3143
  ),
3418
3144
  organism,
3419
3145
  )
3420
3146
  else:
3421
3147
  feature_kwargs = {}
3422
3148
 
3423
- if artifact.otype == "DataFrame":
3424
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
3425
- elif artifact.otype == "AnnData":
3426
- artifact.features._add_set_from_anndata( # type: ignore
3427
- var_field=columns_field, **feature_kwargs
3428
- )
3429
- elif artifact.otype == "MuData":
3430
- artifact.features._add_set_from_mudata( # type: ignore
3431
- var_fields=columns_field, **feature_kwargs
3432
- )
3433
- else:
3434
- raise NotImplementedError
3435
-
3436
3149
  def _add_labels(
3437
- data,
3150
+ data: pd.DataFrame | ScverseDataStructures,
3438
3151
  artifact: Artifact,
3439
3152
  fields: dict[str, FieldAttr],
3440
3153
  feature_ref_is_name: bool | None = None,
@@ -3443,7 +3156,7 @@ def save_artifact(
3443
3156
  for key, field in fields.items():
3444
3157
  feature = features.get(key)
3445
3158
  registry = field.field.model
3446
- filter_kwargs = check_registry_organism(registry, organism)
3159
+ filter_kwargs = configure_organism(registry, organism)
3447
3160
  filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3448
3161
  df = data if isinstance(data, pd.DataFrame) else data.obs
3449
3162
  # multi-value columns are separated by "|"
@@ -3470,35 +3183,81 @@ def save_artifact(
3470
3183
  from_curator=True,
3471
3184
  )
3472
3185
 
3473
- if artifact.otype == "MuData":
3474
- for modality, modality_fields in fields.items():
3475
- column_field_modality = columns_field.get(modality)
3476
- if modality == "obs":
3477
- _add_labels(
3478
- data,
3479
- artifact,
3480
- modality_fields,
3481
- feature_ref_is_name=(
3482
- None
3483
- if column_field_modality is None
3484
- else _ref_is_name(column_field_modality)
3485
- ),
3486
- )
3487
- else:
3488
- _add_labels(
3489
- data[modality],
3490
- artifact,
3491
- modality_fields,
3492
- feature_ref_is_name=(
3493
- None
3494
- if column_field_modality is None
3495
- else _ref_is_name(column_field_modality)
3496
- ),
3497
- )
3498
- else:
3499
- _add_labels(
3500
- data, artifact, fields, feature_ref_is_name=_ref_is_name(columns_field)
3501
- )
3186
+ match artifact.otype:
3187
+ case "DataFrame":
3188
+ artifact.features._add_set_from_df(field=index_field, **feature_kwargs) # type: ignore
3189
+ _add_labels(
3190
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3191
+ )
3192
+ case "AnnData":
3193
+ artifact.features._add_set_from_anndata( # type: ignore
3194
+ var_field=index_field, **feature_kwargs
3195
+ )
3196
+ _add_labels(
3197
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3198
+ )
3199
+ case "MuData":
3200
+ artifact.features._add_set_from_mudata( # type: ignore
3201
+ var_fields=index_field, **feature_kwargs
3202
+ )
3203
+ for modality, modality_fields in fields.items():
3204
+ column_field_modality = index_field.get(modality)
3205
+ if modality == "obs":
3206
+ _add_labels(
3207
+ data,
3208
+ artifact,
3209
+ modality_fields,
3210
+ feature_ref_is_name=(
3211
+ None
3212
+ if column_field_modality is None
3213
+ else _ref_is_name(column_field_modality)
3214
+ ),
3215
+ )
3216
+ else:
3217
+ _add_labels(
3218
+ data[modality],
3219
+ artifact,
3220
+ modality_fields,
3221
+ feature_ref_is_name=(
3222
+ None
3223
+ if column_field_modality is None
3224
+ else _ref_is_name(column_field_modality)
3225
+ ),
3226
+ )
3227
+ case "SpatialData":
3228
+ artifact.features._add_set_from_spatialdata( # type: ignore
3229
+ sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
3230
+ var_fields=index_field,
3231
+ **feature_kwargs,
3232
+ )
3233
+ sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
3234
+ for accessor, accessor_fields in fields.items():
3235
+ column_field = index_field.get(accessor)
3236
+ if accessor == sample_metadata_key:
3237
+ _add_labels(
3238
+ data.get_attrs(
3239
+ key=sample_metadata_key, return_as="df", flatten=True
3240
+ ),
3241
+ artifact,
3242
+ accessor_fields,
3243
+ feature_ref_is_name=(
3244
+ None if column_field is None else _ref_is_name(column_field)
3245
+ ),
3246
+ )
3247
+ else:
3248
+ _add_labels(
3249
+ data.tables[accessor],
3250
+ artifact,
3251
+ accessor_fields,
3252
+ feature_ref_is_name=(
3253
+ None if column_field is None else _ref_is_name(column_field)
3254
+ ),
3255
+ )
3256
+ case _:
3257
+ raise NotImplementedError # pragma: no cover
3258
+
3259
+ artifact.schema = schema
3260
+ artifact.save()
3502
3261
 
3503
3262
  slug = ln_setup.settings.instance.slug
3504
3263
  if ln_setup.settings.instance.is_remote: # pdagma: no cover
@@ -3528,8 +3287,7 @@ def update_registry(
3528
3287
  organism: str | None = None,
3529
3288
  dtype: str | None = None,
3530
3289
  source: Record | None = None,
3531
- exclude: str | list | None = None,
3532
- **kwargs,
3290
+ **create_kwargs,
3533
3291
  ) -> None:
3534
3292
  """Save features or labels records in the default instance..
3535
3293
 
@@ -3542,14 +3300,12 @@ def update_registry(
3542
3300
  organism: The organism name.
3543
3301
  dtype: The type of the feature.
3544
3302
  source: The source record.
3545
- exclude: Values to exclude from inspect.
3546
- kwargs: Additional keyword arguments to pass to the registry model to create new records.
3303
+ **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
3547
3304
  """
3548
- from lamindb.core._settings import settings
3549
3305
  from lamindb.models.save import save as ln_save
3550
3306
 
3551
3307
  registry = field.field.model
3552
- filter_kwargs = check_registry_organism(registry, organism)
3308
+ filter_kwargs = configure_organism(registry, organism)
3553
3309
  filter_kwargs.update({"source": source} if source else {})
3554
3310
  values = [i for i in values if isinstance(i, str) and i]
3555
3311
  if not values:
@@ -3607,14 +3363,16 @@ def update_registry(
3607
3363
  registry(
3608
3364
  **init_kwargs,
3609
3365
  **{k: v for k, v in filter_kwargs.items() if k != "source"},
3610
- **{k: v for k, v in kwargs.items() if k != "sources"},
3366
+ **{
3367
+ k: v for k, v in create_kwargs.items() if k != "sources"
3368
+ },
3611
3369
  )
3612
3370
  )
3613
3371
  ln_save(non_validated_records)
3614
3372
 
3615
3373
  # save parent labels for ulabels, for example a parent label "project" for label "project001"
3616
3374
  if registry == ULabel and field.field.name == "name":
3617
- save_ulabels_parent(values, field=field, key=key)
3375
+ save_ulabels_type(values, field=field, key=key)
3618
3376
 
3619
3377
  finally:
3620
3378
  settings.verbosity = verbosity
@@ -3652,16 +3410,18 @@ def log_saved_labels(
3652
3410
  )
3653
3411
 
3654
3412
 
3655
- def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
3656
- """Save a parent label for the given labels."""
3413
+ def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
3414
+ """Save the ULabel type of the given labels."""
3657
3415
  registry = field.field.model
3658
3416
  assert registry == ULabel # noqa: S101
3659
- all_records = registry.from_values(list(values), field=field)
3660
- is_feature = registry.filter(name=f"{key}").one_or_none()
3661
- if is_feature is None:
3662
- is_feature = registry(name=f"{key}").save()
3663
- logger.important(f"Created a parent ULabel: {is_feature}")
3664
- is_feature.children.add(*all_records)
3417
+ all_records = registry.filter(**{field.field.name: list(values)}).all()
3418
+ # so `tissue_type` becomes `TissueType`
3419
+ type_name = "".join([i.capitalize() for i in key.lower().split("_")])
3420
+ ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
3421
+ if ulabel_type is None:
3422
+ ulabel_type = registry(name=type_name, is_type=True).save()
3423
+ logger.important(f"Created a ULabel type: {ulabel_type}")
3424
+ all_records.update(type=ulabel_type)
3665
3425
 
3666
3426
 
3667
3427
  def _save_organism(name: str):
@@ -3760,7 +3520,6 @@ def from_tiledbsoma(
3760
3520
  obs_columns: FieldAttr = Feature.name,
3761
3521
  organism: str | None = None,
3762
3522
  sources: dict[str, Record] | None = None,
3763
- exclude: dict[str, str | list[str]] | None = None,
3764
3523
  ) -> TiledbsomaCatManager:
3765
3524
  return TiledbsomaCatManager(
3766
3525
  experiment_uri=experiment_uri,
@@ -3769,7 +3528,6 @@ def from_tiledbsoma(
3769
3528
  obs_columns=obs_columns,
3770
3529
  organism=organism,
3771
3530
  sources=sources,
3772
- exclude=exclude,
3773
3531
  )
3774
3532
 
3775
3533
 
@@ -3781,7 +3539,6 @@ def from_spatialdata(
3781
3539
  categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3782
3540
  organism: str | None = None,
3783
3541
  sources: dict[str, dict[str, Record]] | None = None,
3784
- exclude: dict[str, dict] | None = None,
3785
3542
  verbosity: str = "hint",
3786
3543
  *,
3787
3544
  sample_metadata_key: str = "sample",
@@ -3798,7 +3555,6 @@ def from_spatialdata(
3798
3555
  verbosity=verbosity,
3799
3556
  organism=organism,
3800
3557
  sources=sources,
3801
- exclude=exclude,
3802
3558
  sample_metadata_key=sample_metadata_key,
3803
3559
  )
3804
3560