lamindb 1.2a2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,27 @@
1
1
  """Curators.
2
2
 
3
- .. versionadded:: 1.1.0
4
-
5
3
  .. autosummary::
6
4
  :toctree: .
7
5
 
8
- Curator
9
6
  DataFrameCurator
10
7
  AnnDataCurator
8
+ MuDataCurator
9
+ SpatialDataCurator
11
10
 
12
- CatManager:
11
+ Helper classes.
13
12
 
14
13
  .. autosummary::
15
14
  :toctree: .
16
15
 
16
+ Curator
17
+ SlotsCurator
17
18
  CatManager
19
+ CatLookup
18
20
  DataFrameCatManager
19
21
  AnnDataCatManager
20
22
  MuDataCatManager
23
+ SpatialDataCatManager
21
24
  TiledbsomaCatManager
22
- CurateLookup
23
25
 
24
26
  """
25
27
 
@@ -27,7 +29,6 @@ from __future__ import annotations
27
29
 
28
30
  import copy
29
31
  import re
30
- from importlib import resources
31
32
  from itertools import chain
32
33
  from typing import TYPE_CHECKING, Any, Literal
33
34
 
@@ -37,45 +38,44 @@ import pandas as pd
37
38
  import pandera
38
39
  import pyarrow as pa
39
40
  from lamin_utils import colors, logger
40
- from lamindb_setup.core import deprecated, upath
41
+ from lamindb_setup.core import deprecated
41
42
  from lamindb_setup.core._docs import doc_args
42
43
  from lamindb_setup.core.upath import UPath
43
44
 
44
- from lamindb.core.storage._backed_access import backed_access
45
-
46
- from ._cellxgene_schemas import _read_schema_versions
47
-
48
45
  if TYPE_CHECKING:
49
- from anndata import AnnData
50
46
  from lamindb_setup.core.types import UPathStr
47
+ from mudata import MuData
48
+ from spatialdata import SpatialData
51
49
 
52
- from lamindb.base.types import FieldAttr
50
+ from lamindb.core.types import ScverseDataStructures
53
51
  from lamindb.models import Record
54
52
  from lamindb.base.types import FieldAttr # noqa
55
53
  from lamindb.core._settings import settings
56
54
  from lamindb.models import (
57
55
  Artifact,
58
- Collection,
59
56
  Feature,
60
57
  Record,
61
58
  Run,
62
59
  Schema,
63
60
  ULabel,
64
61
  )
65
- from lamindb.models._feature_manager import parse_staged_feature_sets_from_anndata
66
- from lamindb.models.artifact import add_labels, data_is_anndata
62
+ from lamindb.models.artifact import (
63
+ add_labels,
64
+ data_is_anndata,
65
+ data_is_mudata,
66
+ data_is_spatialdata,
67
+ )
67
68
  from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
68
69
  from lamindb.models._from_values import _format_values
69
70
 
70
71
  from ..errors import InvalidArgument, ValidationError
72
+ from anndata import AnnData
71
73
 
72
74
  if TYPE_CHECKING:
73
75
  from collections.abc import Iterable, MutableMapping
74
76
  from typing import Any
75
77
 
76
78
  from lamindb_setup.core.types import UPathStr
77
- from mudata import MuData
78
- from spatialdata import SpatialData
79
79
 
80
80
  from lamindb.models.query_set import RecordList
81
81
 
@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
86
86
  return ansi_pattern.sub("", text)
87
87
 
88
88
 
89
- class CurateLookup:
89
+ class CatLookup:
90
90
  """Lookup categories from the reference instance.
91
91
 
92
92
  Args:
@@ -94,10 +94,10 @@ class CurateLookup:
94
94
  slots: A dictionary of slot fields to lookup.
95
95
  public: Whether to lookup from the public instance. Defaults to False.
96
96
 
97
- Example:
98
- >>> curator = ln.Curator.from_df(...)
99
- >>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
100
- <Category: alveolar_type_1_fibroblast_cell>
97
+ Example::
98
+
99
+ curator = ln.curators.DataFrameCurator(...)
100
+ curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
101
101
 
102
102
  """
103
103
 
@@ -150,7 +150,7 @@ class CurateLookup:
150
150
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
151
151
  "To look up public ontologies, use .lookup(public=True)"
152
152
  )
153
- else: # pdagma: no cover
153
+ else: # pragma: no cover
154
154
  return colors.warning("No fields are found!")
155
155
 
156
156
 
@@ -163,7 +163,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
163
163
  """
164
164
 
165
165
 
166
- VALIDATE_DOCSTRING = """Validate dataset.
166
+ VALIDATE_DOCSTRING = """Validate dataset against Schema.
167
167
 
168
168
  Raises:
169
169
  lamindb.errors.ValidationError: If validation fails.
@@ -183,15 +183,17 @@ Returns:
183
183
 
184
184
 
185
185
  class Curator:
186
- """Dataset curator.
186
+ """Curator base class.
187
187
 
188
188
  A `Curator` object makes it easy to validate, standardize & annotate datasets.
189
189
 
190
- .. versionadded:: 1.1.0
191
-
192
190
  See:
193
191
  - :class:`~lamindb.curators.DataFrameCurator`
194
192
  - :class:`~lamindb.curators.AnnDataCurator`
193
+ - :class:`~lamindb.curators.MuDataCurator`
194
+ - :class:`~lamindb.curators.SpatialDataCurator`
195
+
196
+ .. versionadded:: 1.1.0
195
197
  """
196
198
 
197
199
  def __init__(self, dataset: Any, schema: Schema | None = None):
@@ -199,7 +201,12 @@ class Curator:
199
201
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
200
202
  if isinstance(self._dataset, Artifact):
201
203
  self._artifact = self._dataset
202
- if self._artifact.otype in {"DataFrame", "AnnData"}:
204
+ if self._artifact.otype in {
205
+ "DataFrame",
206
+ "AnnData",
207
+ "MuData",
208
+ "SpatialData",
209
+ }:
203
210
  self._dataset = self._dataset.load()
204
211
  self._schema: Schema | None = schema
205
212
  self._is_validated: bool = False
@@ -208,7 +215,7 @@ class Curator:
208
215
  @doc_args(VALIDATE_DOCSTRING)
209
216
  def validate(self) -> bool | str:
210
217
  """{}""" # noqa: D415
211
- pass # pdagma: no cover
218
+ pass # pragma: no cover
212
219
 
213
220
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
214
221
  def save_artifact(
@@ -225,9 +232,72 @@ class Curator:
225
232
  pass
226
233
 
227
234
 
235
+ class SlotsCurator(Curator):
236
+ """Curator for a dataset with slots.
237
+
238
+ Args:
239
+ dataset: The dataset to validate & annotate.
240
+ schema: A `Schema` object that defines the validation constraints.
241
+
242
+ .. versionadded:: 1.3.0
243
+ """
244
+
245
+ def __init__(
246
+ self,
247
+ dataset: Any,
248
+ schema: Schema,
249
+ ) -> None:
250
+ super().__init__(dataset=dataset, schema=schema)
251
+ self._slots: dict[str, DataFrameCurator] = {}
252
+
253
+ # used in MuDataCurator and SpatialDataCurator
254
+ # in form of {table/modality_key: var_field}
255
+ self._var_fields: dict[str, FieldAttr] = {}
256
+ # in form of {table/modality_key: categoricals}
257
+ self._categoricals: dict[str, dict[str, FieldAttr]] = {}
258
+
259
+ @property
260
+ @doc_args(SLOTS_DOCSTRING)
261
+ def slots(self) -> dict[str, DataFrameCurator]:
262
+ """{}""" # noqa: D415
263
+ return self._slots
264
+
265
+ @doc_args(VALIDATE_DOCSTRING)
266
+ def validate(self) -> None:
267
+ """{}""" # noqa: D415
268
+ for _, curator in self._slots.items():
269
+ curator.validate()
270
+
271
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
272
+ def save_artifact(
273
+ self,
274
+ *,
275
+ key: str | None = None,
276
+ description: str | None = None,
277
+ revises: Artifact | None = None,
278
+ run: Run | None = None,
279
+ ) -> Artifact:
280
+ """{}""" # noqa: D415
281
+ if not self._is_validated:
282
+ self.validate()
283
+
284
+ # default implementation for MuDataCurator and SpatialDataCurator
285
+ return save_artifact( # type: ignore
286
+ self._dataset,
287
+ key=key,
288
+ description=description,
289
+ fields=self._categoricals,
290
+ index_field=self._var_fields,
291
+ artifact=self._artifact,
292
+ revises=revises,
293
+ run=run,
294
+ schema=self._schema,
295
+ )
296
+
297
+
228
298
  class DataFrameCurator(Curator):
229
299
  # the example in the docstring is tested in test_curators_quickstart_example
230
- """Curator for a DataFrame object.
300
+ """Curator for `DataFrame`.
231
301
 
232
302
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
233
303
 
@@ -282,7 +352,9 @@ class DataFrameCurator(Curator):
282
352
  feature.dtype if not feature.dtype.startswith("cat") else "category"
283
353
  )
284
354
  pandera_columns[feature.name] = pandera.Column(
285
- pandera_dtype, nullable=feature.nullable
355
+ pandera_dtype,
356
+ nullable=feature.nullable,
357
+ coerce=feature.coerce_dtype,
286
358
  )
287
359
  if feature.dtype.startswith("cat"):
288
360
  categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
@@ -378,7 +450,7 @@ class DataFrameCurator(Curator):
378
450
  description: str | None = None,
379
451
  revises: Artifact | None = None,
380
452
  run: Run | None = None,
381
- ):
453
+ ) -> Artifact:
382
454
  """{}""" # noqa: D415
383
455
  if not self._is_validated:
384
456
  self.validate() # raises ValidationError if doesn't validate
@@ -387,7 +459,7 @@ class DataFrameCurator(Curator):
387
459
  self._dataset,
388
460
  description=description,
389
461
  fields=self._cat_manager.categoricals,
390
- columns_field=result["field"],
462
+ index_field=result["field"],
391
463
  key=key,
392
464
  artifact=self._artifact,
393
465
  revises=revises,
@@ -396,9 +468,9 @@ class DataFrameCurator(Curator):
396
468
  )
397
469
 
398
470
 
399
- class AnnDataCurator(Curator):
471
+ class AnnDataCurator(SlotsCurator):
400
472
  # the example in the docstring is tested in test_curators_quickstart_example
401
- """Curator for a DataFrame object.
473
+ """Curator for `AnnData`.
402
474
 
403
475
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
404
476
 
@@ -446,7 +518,7 @@ class AnnDataCurator(Curator):
446
518
  ).save()
447
519
 
448
520
  # curate an AnnData
449
- adata = datasets.small_dataset1(otype="AnnData")
521
+ adata = ln.core.datasets.small_dataset1(otype="AnnData")
450
522
  curator = ln.curators.AnnDataCurator(adata, anndata_schema)
451
523
  artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
452
524
  assert artifact.schema == anndata_schema
@@ -466,9 +538,9 @@ class AnnDataCurator(Curator):
466
538
  self._slots = {
467
539
  slot: DataFrameCurator(
468
540
  (
469
- self._dataset.__getattribute__(slot).T
541
+ getattr(self._dataset, slot).T
470
542
  if slot == "var"
471
- else self._dataset.__getattribute__(slot)
543
+ else getattr(self._dataset, slot)
472
544
  ),
473
545
  slot_schema,
474
546
  )
@@ -476,18 +548,6 @@ class AnnDataCurator(Curator):
476
548
  if slot in {"obs", "var"}
477
549
  }
478
550
 
479
- @property
480
- @doc_args(SLOTS_DOCSTRING)
481
- def slots(self) -> dict[str, DataFrameCurator]:
482
- """{}""" # noqa: D415
483
- return self._slots
484
-
485
- @doc_args(VALIDATE_DOCSTRING)
486
- def validate(self) -> None:
487
- """{}""" # noqa: D415
488
- for _, curator in self._slots.items():
489
- curator.validate()
490
-
491
551
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
492
552
  def save_artifact(
493
553
  self,
@@ -496,7 +556,7 @@ class AnnDataCurator(Curator):
496
556
  description: str | None = None,
497
557
  revises: Artifact | None = None,
498
558
  run: Run | None = None,
499
- ):
559
+ ) -> Artifact:
500
560
  """{}""" # noqa: D415
501
561
  if not self._is_validated:
502
562
  self.validate()
@@ -504,7 +564,7 @@ class AnnDataCurator(Curator):
504
564
  self._dataset,
505
565
  description=description,
506
566
  fields=self.slots["obs"]._cat_manager.categoricals,
507
- columns_field=(
567
+ index_field=(
508
568
  parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
509
569
  "field"
510
570
  ]
@@ -519,34 +579,286 @@ class AnnDataCurator(Curator):
519
579
  )
520
580
 
521
581
 
522
- class CatManager:
523
- """Manage valid categoricals by updating registries.
582
+ def _assign_var_fields_categoricals_multimodal(
583
+ modality: str | None,
584
+ slot_type: str,
585
+ slot: str,
586
+ slot_schema: Schema,
587
+ var_fields: dict[str, FieldAttr],
588
+ categoricals: dict[str, dict[str, FieldAttr]],
589
+ slots: dict[str, DataFrameCurator],
590
+ ) -> None:
591
+ """Assigns var_fields and categoricals for multimodal data curators."""
592
+ if modality is not None:
593
+ # Makes sure that all tables are present
594
+ var_fields[modality] = None
595
+ categoricals[modality] = {}
596
+
597
+ if slot_type == "var":
598
+ var_field = parse_dtype_single_cat(slot_schema.itype, is_itype=True)["field"]
599
+ if modality is None:
600
+ # This should rarely/never be used since tables should have different var fields
601
+ var_fields[slot] = var_field # pragma: no cover
602
+ else:
603
+ # Note that this is NOT nested since the nested key is always "var"
604
+ var_fields[modality] = var_field
605
+ else:
606
+ obs_fields = slots[slot]._cat_manager.categoricals
607
+ if modality is None:
608
+ categoricals[slot] = obs_fields
609
+ else:
610
+ # Note that this is NOT nested since the nested key is always "obs"
611
+ categoricals[modality] = obs_fields
612
+
524
613
 
525
- A `CatManager` object makes it easy to validate, standardize & annotate datasets.
614
+ class MuDataCurator(SlotsCurator):
615
+ # the example in the docstring is tested in test_curators_quickstart_example
616
+ """Curator for `MuData`.
617
+
618
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
526
619
 
527
- Example:
620
+ .. versionadded:: 1.3.0
528
621
 
529
- >>> cat_manager = ln.CatManager(
530
- >>> dataset,
531
- >>> # define validation criteria as mappings
532
- >>> columns=Feature.name, # map column names
533
- >>> categoricals={"perturbation": ULabel.name}, # map categories
534
- >>> )
535
- >>> cat_manager.validate() # validate the dataframe
536
- >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
537
- >>> artifact.describe() # see annotations
622
+ Args:
623
+ dataset: The MuData-like object to validate & annotate.
624
+ schema: A `Schema` object that defines the validation constraints.
538
625
 
539
- `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
626
+ Example::
540
627
 
541
- If you find non-validated values, you have several options:
628
+ import lamindb as ln
629
+ import bionty as bt
630
+
631
+ # define the global obs schema
632
+ obs_schema = ln.Schema(
633
+ name="mudata_papalexi21_subset_obs_schema",
634
+ features=[
635
+ ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
636
+ ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
637
+ ],
638
+ ).save()
639
+
640
+ # define the ['rna'].obs schema
641
+ obs_schema_rna = ln.Schema(
642
+ name="mudata_papalexi21_subset_rna_obs_schema",
643
+ features=[
644
+ ln.Feature(name="nCount_RNA", dtype=int).save(),
645
+ ln.Feature(name="nFeature_RNA", dtype=int).save(),
646
+ ln.Feature(name="percent.mito", dtype=float).save(),
647
+ ],
648
+ coerce_dtype=True,
649
+ ).save()
650
+
651
+ # define the ['hto'].obs schema
652
+ obs_schema_hto = ln.Schema(
653
+ name="mudata_papalexi21_subset_hto_obs_schema",
654
+ features=[
655
+ ln.Feature(name="nCount_HTO", dtype=int).save(),
656
+ ln.Feature(name="nFeature_HTO", dtype=int).save(),
657
+ ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
658
+ ],
659
+ coerce_dtype=True,
660
+ ).save()
661
+
662
+ # define ['rna'].var schema
663
+ var_schema_rna = ln.Schema(
664
+ name="mudata_papalexi21_subset_rna_var_schema",
665
+ itype=bt.Gene.symbol,
666
+ dtype=float,
667
+ ).save()
668
+
669
+ # define composite schema
670
+ mudata_schema = ln.Schema(
671
+ name="mudata_papalexi21_subset_mudata_schema",
672
+ otype="MuData",
673
+ components={
674
+ "obs": obs_schema,
675
+ "rna:obs": obs_schema_rna,
676
+ "hto:obs": obs_schema_hto,
677
+ "rna:var": var_schema_rna,
678
+ },
679
+ ).save()
542
680
 
543
- - new values found in the data can be registered using :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
544
- - non-validated values can be accessed using :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
681
+ # curate a MuData
682
+ mdata = ln.core.datasets.mudata_papalexi21_subset()
683
+ bt.settings.organism = "human" # set the organism
684
+ curator = ln.curators.MuDataCurator(mdata, mudata_schema)
685
+ artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
686
+ assert artifact.schema == mudata_schema
545
687
  """
546
688
 
547
689
  def __init__(
548
- self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
549
- ):
690
+ self,
691
+ dataset: MuData | Artifact,
692
+ schema: Schema,
693
+ ) -> None:
694
+ super().__init__(dataset=dataset, schema=schema)
695
+ if not data_is_mudata(self._dataset):
696
+ raise InvalidArgument("dataset must be MuData-like.")
697
+ if schema.otype != "MuData":
698
+ raise InvalidArgument("Schema otype must be 'MuData'.")
699
+
700
+ for slot, slot_schema in schema.slots.items():
701
+ # Assign to _slots
702
+ if ":" in slot:
703
+ modality, modality_slot = slot.split(":")
704
+ schema_dataset = self._dataset.__getitem__(modality)
705
+ else:
706
+ modality, modality_slot = None, slot
707
+ schema_dataset = self._dataset
708
+ self._slots[slot] = DataFrameCurator(
709
+ (
710
+ getattr(schema_dataset, modality_slot).T
711
+ if modality_slot == "var"
712
+ else getattr(schema_dataset, modality_slot)
713
+ ),
714
+ slot_schema,
715
+ )
716
+ _assign_var_fields_categoricals_multimodal(
717
+ modality=modality,
718
+ slot_type=modality_slot,
719
+ slot=slot,
720
+ slot_schema=slot_schema,
721
+ var_fields=self._var_fields,
722
+ categoricals=self._categoricals,
723
+ slots=self._slots,
724
+ )
725
+
726
+ # for consistency with BaseCatManager
727
+ self._columns_field = self._var_fields
728
+
729
+
730
+ class SpatialDataCurator(SlotsCurator):
731
+ # the example in the docstring is tested in test_curators_quickstart_example
732
+ """Curator for `SpatialData`.
733
+
734
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
735
+
736
+ .. versionadded:: 1.3.0
737
+
738
+ Args:
739
+ dataset: The SpatialData-like object to validate & annotate.
740
+ schema: A `Schema` object that defines the validation constraints.
741
+
742
+ Example::
743
+
744
+ import lamindb as ln
745
+ import bionty as bt
746
+
747
+ # define sample schema
748
+ sample_schema = ln.Schema(
749
+ name="blobs_sample_level_metadata",
750
+ features=[
751
+ ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
752
+ ln.Feature(name="disease", dtype=bt.Disease).save(),
753
+ ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
754
+ ],
755
+ coerce_dtype=True
756
+ ).save()
757
+
758
+ # define table obs schema
759
+ blobs_obs_schema = ln.Schema(
760
+ name="blobs_obs_level_metadata",
761
+ features=[
762
+ ln.Feature(name="sample_region", dtype="str").save(),
763
+ ],
764
+ coerce_dtype=True
765
+ ).save()
766
+
767
+ # define table var schema
768
+ blobs_var_schema = ln.Schema(
769
+ name="blobs_var_schema",
770
+ itype=bt.Gene.ensembl_gene_id,
771
+ dtype=int
772
+ ).save()
773
+
774
+ # define composite schema
775
+ spatialdata_schema = ln.Schema(
776
+ name="blobs_spatialdata_schema",
777
+ otype="SpatialData",
778
+ components={
779
+ "sample": sample_schema,
780
+ "table:obs": blobs_obs_schema,
781
+ "table:var": blobs_var_schema,
782
+ }).save()
783
+
784
+ # curate a SpatialData
785
+ spatialdata = ln.core.datasets.spatialdata_blobs()
786
+ curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
787
+ try:
788
+ curator.validate()
789
+ except ln.errors.ValidationError as error:
790
+ print(error)
791
+
792
+ # validate again (must pass now) and save artifact
793
+ artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
794
+ assert artifact.schema == spatialdata_schema
795
+ """
796
+
797
+ def __init__(
798
+ self,
799
+ dataset: SpatialData | Artifact,
800
+ schema: Schema,
801
+ *,
802
+ sample_metadata_key: str | None = "sample",
803
+ ) -> None:
804
+ super().__init__(dataset=dataset, schema=schema)
805
+ if not data_is_spatialdata(self._dataset):
806
+ raise InvalidArgument("dataset must be SpatialData-like.")
807
+ if schema.otype != "SpatialData":
808
+ raise InvalidArgument("Schema otype must be 'SpatialData'.")
809
+
810
+ for slot, slot_schema in schema.slots.items():
811
+ # Assign to _slots
812
+ if ":" in slot:
813
+ table_key, table_slot = slot.split(":")
814
+ schema_dataset = self._dataset.tables.__getitem__(table_key)
815
+ # sample metadata (does not have a `:` separator)
816
+ else:
817
+ table_key = None
818
+ table_slot = slot
819
+ schema_dataset = self._dataset.get_attrs(
820
+ key=sample_metadata_key, return_as="df", flatten=True
821
+ )
822
+
823
+ self._slots[slot] = DataFrameCurator(
824
+ (
825
+ getattr(schema_dataset, table_slot).T
826
+ if table_slot == "var"
827
+ else (
828
+ getattr(schema_dataset, table_slot)
829
+ if table_slot != sample_metadata_key
830
+ else schema_dataset
831
+ ) # just take the schema_dataset if it's the sample metadata key
832
+ ),
833
+ slot_schema,
834
+ )
835
+
836
+ _assign_var_fields_categoricals_multimodal(
837
+ modality=table_key,
838
+ slot_type=table_slot,
839
+ slot=slot,
840
+ slot_schema=slot_schema,
841
+ var_fields=self._var_fields,
842
+ categoricals=self._categoricals,
843
+ slots=self._slots,
844
+ )
845
+
846
+ # for consistency with BaseCatManager
847
+ self._columns_field = self._var_fields
848
+
849
+
850
+ class CatManager:
851
+ """Manage categoricals by updating registries.
852
+
853
+ This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
854
+
855
+ If you find non-validated values, you have several options:
856
+
857
+ - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
858
+ - non-validated values can be accessed via `DataFrameCurator.cat.non_validated` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
859
+ """
860
+
861
+ def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
550
862
  # the below is shared with Curator
551
863
  self._artifact: Artifact = None # pass the dataset as an artifact
552
864
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
@@ -560,7 +872,6 @@ class CatManager:
560
872
  self._non_validated = None
561
873
  self._organism = organism
562
874
  self._sources = sources or {}
563
- self._exclude = exclude or {}
564
875
  self._columns_field = columns_field
565
876
  self._validate_category_error_messages: str = ""
566
877
 
@@ -620,7 +931,7 @@ class CatManager:
620
931
  Returns:
621
932
  None
622
933
  """
623
- pass # pdagma: no cover
934
+ pass # pragma: no cover
624
935
 
625
936
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
626
937
  def save_artifact(
@@ -645,10 +956,10 @@ class CatManager:
645
956
  settings.verbosity = "warning"
646
957
  self._artifact = save_artifact( # type: ignore
647
958
  self._dataset,
959
+ key=key,
648
960
  description=description,
649
961
  fields=self.categoricals,
650
- columns_field=self._columns_field,
651
- key=key,
962
+ index_field=self._columns_field,
652
963
  artifact=self._artifact,
653
964
  revises=revises,
654
965
  run=run,
@@ -662,34 +973,7 @@ class CatManager:
662
973
 
663
974
 
664
975
  class DataFrameCatManager(CatManager):
665
- """Curation flow for a DataFrame object.
666
-
667
- See also :class:`~lamindb.Curator`.
668
-
669
- Args:
670
- df: The DataFrame object to curate.
671
- columns: The field attribute for the feature column.
672
- categoricals: A dictionary mapping column names to registry_field.
673
- verbosity: The verbosity level.
674
- organism: The organism name.
675
- sources: A dictionary mapping column names to Source records.
676
- exclude: A dictionary mapping column names to values to exclude from validation.
677
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
678
- using the exclude parameter ensures they are not validated.
679
-
680
- Returns:
681
- A curator object.
682
-
683
- Examples:
684
- >>> import bionty as bt
685
- >>> curator = ln.Curator.from_df(
686
- ... df,
687
- ... categoricals={
688
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
689
- ... "donor_id": ULabel.name
690
- ... }
691
- ... )
692
- """
976
+ """Categorical manager for `DataFrame`."""
693
977
 
694
978
  def __init__(
695
979
  self,
@@ -699,7 +983,6 @@ class DataFrameCatManager(CatManager):
699
983
  verbosity: str = "hint",
700
984
  organism: str | None = None,
701
985
  sources: dict[str, Record] | None = None,
702
- exclude: dict | None = None,
703
986
  ) -> None:
704
987
  from lamindb.core._settings import settings
705
988
 
@@ -714,17 +997,16 @@ class DataFrameCatManager(CatManager):
714
997
  organism=organism,
715
998
  categoricals=categoricals,
716
999
  sources=sources,
717
- exclude=exclude,
718
1000
  )
719
1001
  self._save_columns()
720
1002
 
721
- def lookup(self, public: bool = False) -> CurateLookup:
1003
+ def lookup(self, public: bool = False) -> CatLookup:
722
1004
  """Lookup categories.
723
1005
 
724
1006
  Args:
725
1007
  public: If `True`, the lookup is performed on the public reference.
726
1008
  """
727
- return CurateLookup(
1009
+ return CatLookup(
728
1010
  categoricals=self._categoricals,
729
1011
  slots={"columns": self._columns_field},
730
1012
  public=public,
@@ -739,7 +1021,6 @@ class DataFrameCatManager(CatManager):
739
1021
  key="columns",
740
1022
  validated_only=False,
741
1023
  source=self._sources.get("columns"),
742
- exclude=self._exclude.get("columns"),
743
1024
  )
744
1025
 
745
1026
  # Save the rest of the columns based on validated_only
@@ -752,7 +1033,6 @@ class DataFrameCatManager(CatManager):
752
1033
  validated_only=validated_only,
753
1034
  df=self._dataset, # Get the Feature type from df
754
1035
  source=self._sources.get("columns"),
755
- exclude=self._exclude.get("columns"),
756
1036
  )
757
1037
 
758
1038
  @deprecated(new_name="is run by default")
@@ -778,7 +1058,6 @@ class DataFrameCatManager(CatManager):
778
1058
  self._dataset,
779
1059
  fields=self.categoricals,
780
1060
  sources=self._sources,
781
- exclude=self._exclude,
782
1061
  curator=self,
783
1062
  organism=self._organism,
784
1063
  )
@@ -852,7 +1131,6 @@ class DataFrameCatManager(CatManager):
852
1131
  key=categorical,
853
1132
  validated_only=validated_only,
854
1133
  source=self._sources.get(categorical),
855
- exclude=self._exclude.get(categorical),
856
1134
  organism=self._organism,
857
1135
  )
858
1136
  # adding new records removes them from non_validated
@@ -882,32 +1160,7 @@ class DataFrameCatManager(CatManager):
882
1160
 
883
1161
 
884
1162
  class AnnDataCatManager(CatManager):
885
- """Manage categorical curation.
886
-
887
- Args:
888
- data: The AnnData object or an AnnData-like path.
889
- var_index: The registry field for mapping the ``.var`` index.
890
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
891
- obs_columns: The registry field for mapping the ``.obs.columns``.
892
- verbosity: The verbosity level.
893
- organism: The organism name.
894
- sources: A dictionary mapping ``.obs.columns`` to Source records.
895
- exclude: A dictionary mapping column names to values to exclude from validation.
896
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
897
- using the exclude parameter ensures they are not validated.
898
-
899
- Examples:
900
- >>> import bionty as bt
901
- >>> curator = ln.Curator.from_anndata(
902
- ... adata,
903
- ... var_index=bt.Gene.ensembl_gene_id,
904
- ... categoricals={
905
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
906
- ... "donor_id": ULabel.name
907
- ... },
908
- ... organism="human",
909
- ... )
910
- """
1163
+ """Categorical manager for `AnnData`."""
911
1164
 
912
1165
  def __init__(
913
1166
  self,
@@ -918,13 +1171,10 @@ class AnnDataCatManager(CatManager):
918
1171
  verbosity: str = "hint",
919
1172
  organism: str | None = None,
920
1173
  sources: dict[str, Record] | None = None,
921
- exclude: dict | None = None,
922
1174
  ) -> None:
923
1175
  if isinstance(var_index, str):
924
1176
  raise TypeError("var_index parameter has to be a bionty field")
925
1177
 
926
- if sources is None:
927
- sources = {}
928
1178
  if not data_is_anndata(data):
929
1179
  raise TypeError("data has to be an AnnData object")
930
1180
 
@@ -935,12 +1185,12 @@ class AnnDataCatManager(CatManager):
935
1185
 
936
1186
  self._obs_fields = categoricals or {}
937
1187
  self._var_field = var_index
1188
+ self._sources = sources or {}
938
1189
  super().__init__(
939
1190
  dataset=data,
940
1191
  categoricals=categoricals,
941
- sources=sources,
1192
+ sources=self._sources,
942
1193
  organism=organism,
943
- exclude=exclude,
944
1194
  columns_field=var_index,
945
1195
  )
946
1196
  self._adata = self._dataset
@@ -950,8 +1200,7 @@ class AnnDataCatManager(CatManager):
950
1200
  columns=obs_columns,
951
1201
  verbosity=verbosity,
952
1202
  organism=None,
953
- sources=sources,
954
- exclude=exclude,
1203
+ sources=self._sources,
955
1204
  )
956
1205
 
957
1206
  @property
@@ -964,13 +1213,13 @@ class AnnDataCatManager(CatManager):
964
1213
  """Return the obs fields to validate against."""
965
1214
  return self._obs_fields
966
1215
 
967
- def lookup(self, public: bool = False) -> CurateLookup:
1216
+ def lookup(self, public: bool = False) -> CatLookup:
968
1217
  """Lookup categories.
969
1218
 
970
1219
  Args:
971
1220
  public: If `True`, the lookup is performed on the public reference.
972
1221
  """
973
- return CurateLookup(
1222
+ return CatLookup(
974
1223
  categoricals=self._obs_fields,
975
1224
  slots={"columns": self._columns_field, "var_index": self._var_field},
976
1225
  public=public,
@@ -989,7 +1238,6 @@ class AnnDataCatManager(CatManager):
989
1238
  validated_only=validated_only,
990
1239
  organism=self._organism,
991
1240
  source=self._sources.get("var_index"),
992
- exclude=self._exclude.get("var_index"),
993
1241
  )
994
1242
 
995
1243
  def add_new_from(self, key: str, **kwargs):
@@ -1033,7 +1281,6 @@ class AnnDataCatManager(CatManager):
1033
1281
  key="var_index",
1034
1282
  source=self._sources.get("var_index"),
1035
1283
  hint_print=".add_new_from_var_index()",
1036
- exclude=self._exclude.get("var_index"),
1037
1284
  organism=self._organism, # type: ignore
1038
1285
  )
1039
1286
  else:
@@ -1077,59 +1324,29 @@ class AnnDataCatManager(CatManager):
1077
1324
 
1078
1325
 
1079
1326
  class MuDataCatManager(CatManager):
1080
- """Curation flow for a ``MuData`` object.
1081
-
1082
- Args:
1083
- mdata: The MuData object to curate.
1084
- var_index: The registry field for mapping the ``.var`` index for each modality.
1085
- For example:
1086
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
1087
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
1088
- Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
1089
- verbosity: The verbosity level.
1090
- organism: The organism name.
1091
- sources: A dictionary mapping ``.obs.columns`` to Source records.
1092
- exclude: A dictionary mapping column names to values to exclude from validation.
1093
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1094
- using the exclude parameter ensures they are not validated.
1095
-
1096
- Examples:
1097
- >>> import bionty as bt
1098
- >>> curator = ln.Curator.from_mudata(
1099
- ... mdata,
1100
- ... var_index={
1101
- ... "rna": bt.Gene.ensembl_gene_id,
1102
- ... "adt": CellMarker.name
1103
- ... },
1104
- ... categoricals={
1105
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1106
- ... "donor_id": ULabel.name
1107
- ... },
1108
- ... organism="human",
1109
- ... )
1110
- """
1327
+ """Categorical manager for `MuData`."""
1111
1328
 
1112
1329
  def __init__(
1113
1330
  self,
1114
1331
  mdata: MuData | Artifact,
1115
- var_index: dict[str, FieldAttr],
1332
+ var_index: dict[str, FieldAttr] | None = None,
1116
1333
  categoricals: dict[str, FieldAttr] | None = None,
1117
1334
  verbosity: str = "hint",
1118
1335
  organism: str | None = None,
1119
1336
  sources: dict[str, Record] | None = None,
1120
- exclude: dict | None = None, # {modality: {field: [values]}}
1121
1337
  ) -> None:
1122
1338
  super().__init__(
1123
1339
  dataset=mdata,
1124
1340
  categoricals={},
1125
1341
  sources=sources,
1126
1342
  organism=organism,
1127
- exclude=exclude,
1128
1343
  )
1129
- self._columns_field = var_index # this is for consistency with BaseCatManager
1130
- self._var_fields = var_index
1344
+ self._columns_field = (
1345
+ var_index or {}
1346
+ ) # this is for consistency with BaseCatManager
1347
+ self._var_fields = var_index or {}
1131
1348
  self._verify_modality(self._var_fields.keys())
1132
- self._obs_fields = self._parse_categoricals(categoricals)
1349
+ self._obs_fields = self._parse_categoricals(categoricals or {})
1133
1350
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
1134
1351
  self._verbosity = verbosity
1135
1352
  self._obs_df_curator = None
@@ -1140,7 +1357,6 @@ class MuDataCatManager(CatManager):
1140
1357
  categoricals=self._obs_fields.get("obs", {}),
1141
1358
  verbosity=verbosity,
1142
1359
  sources=self._sources.get("obs"),
1143
- exclude=self._exclude.get("obs"),
1144
1360
  organism=organism,
1145
1361
  )
1146
1362
  self._mod_adata_curators = {
@@ -1150,7 +1366,6 @@ class MuDataCatManager(CatManager):
1150
1366
  categoricals=self._obs_fields.get(modality),
1151
1367
  verbosity=verbosity,
1152
1368
  sources=self._sources.get(modality),
1153
- exclude=self._exclude.get(modality),
1154
1369
  organism=organism,
1155
1370
  )
1156
1371
  for modality in self._modalities
@@ -1199,7 +1414,7 @@ class MuDataCatManager(CatManager):
1199
1414
  obs_fields["obs"][k] = v
1200
1415
  return obs_fields
1201
1416
 
1202
- def lookup(self, public: bool = False) -> CurateLookup:
1417
+ def lookup(self, public: bool = False) -> CatLookup:
1203
1418
  """Lookup categories.
1204
1419
 
1205
1420
  Args:
@@ -1212,7 +1427,7 @@ class MuDataCatManager(CatManager):
1212
1427
  obs_fields[k] = v
1213
1428
  else:
1214
1429
  obs_fields[f"{mod}:{k}"] = v
1215
- return CurateLookup(
1430
+ return CatLookup(
1216
1431
  categoricals=obs_fields,
1217
1432
  slots={
1218
1433
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
@@ -1271,8 +1486,6 @@ class MuDataCatManager(CatManager):
1271
1486
 
1272
1487
  def validate(self) -> bool:
1273
1488
  """Validate categories."""
1274
- from lamindb.core._settings import settings
1275
-
1276
1489
  # add all validated records to the current instance
1277
1490
  verbosity = settings.verbosity
1278
1491
  try:
@@ -1329,393 +1542,290 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1329
1542
  )
1330
1543
 
1331
1544
 
1332
- class TiledbsomaCatManager(CatManager):
1333
- """Curation flow for `tiledbsoma.Experiment`.
1334
-
1335
- Args:
1336
- experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
1337
- var_index: The registry fields for mapping the `.var` indices for measurements.
1338
- Should be in the form `{"measurement name": ("var column", field)}`.
1339
- These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
1340
- in `.standardize` or `.add_new_from`, see the output of `.var_index`.
1341
- categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
1342
- obs_columns: The registry field for mapping the names of the `.obs` columns.
1343
- organism: The organism name.
1344
- sources: A dictionary mapping `.obs` columns to Source records.
1345
- exclude: A dictionary mapping column names to values to exclude from validation.
1346
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1347
- using the exclude parameter ensures they are not validated.
1348
-
1349
- Examples:
1350
- >>> import bionty as bt
1351
- >>> curator = ln.Curator.from_tiledbsoma(
1352
- ... "./my_array_store.tiledbsoma",
1353
- ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1354
- ... categoricals={
1355
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1356
- ... "donor_id": ULabel.name
1357
- ... },
1358
- ... organism="human",
1359
- ... )
1360
- """
1545
+ class SpatialDataCatManager(CatManager):
1546
+ """Categorical manager for `SpatialData`."""
1361
1547
 
1362
1548
  def __init__(
1363
1549
  self,
1364
- experiment_uri: UPathStr | Artifact,
1365
- var_index: dict[str, tuple[str, FieldAttr]],
1366
- categoricals: dict[str, FieldAttr] | None = None,
1367
- obs_columns: FieldAttr = Feature.name,
1550
+ sdata: Any,
1551
+ var_index: dict[str, FieldAttr],
1552
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1553
+ verbosity: str = "hint",
1368
1554
  organism: str | None = None,
1369
- sources: dict[str, Record] | None = None,
1370
- exclude: dict[str, str | list[str]] | None = None,
1371
- ):
1372
- self._obs_fields = categoricals or {}
1373
- self._var_fields = var_index
1374
- self._columns_field = obs_columns
1375
- if isinstance(experiment_uri, Artifact):
1376
- self._dataset = experiment_uri.path
1377
- self._artifact = experiment_uri
1555
+ sources: dict[str, dict[str, Record]] | None = None,
1556
+ *,
1557
+ sample_metadata_key: str | None = "sample",
1558
+ ) -> None:
1559
+ super().__init__(
1560
+ dataset=sdata,
1561
+ categoricals={},
1562
+ sources=sources,
1563
+ organism=organism,
1564
+ )
1565
+ if isinstance(sdata, Artifact):
1566
+ self._sdata = sdata.load()
1378
1567
  else:
1379
- self._dataset = UPath(experiment_uri)
1380
- self._artifact = None
1381
- self._organism = organism
1382
- self._sources = sources or {}
1383
- self._exclude = exclude or {}
1384
-
1385
- self._is_validated: bool | None = False
1386
- self._non_validated_values: dict[str, list] | None = None
1387
- self._validated_values: dict[str, list] = {}
1388
- # filled by _check_save_keys
1389
- self._n_obs: int | None = None
1390
- self._valid_obs_keys: list[str] | None = None
1391
- self._obs_pa_schema: pa.lib.Schema | None = (
1392
- None # this is needed to create the obs feature set
1568
+ self._sdata = self._dataset
1569
+ self._sample_metadata_key = sample_metadata_key
1570
+ self._write_path = None
1571
+ self._var_fields = var_index
1572
+ self._verify_accessor_exists(self._var_fields.keys())
1573
+ self._categoricals = categoricals
1574
+ self._table_keys = set(self._var_fields.keys()) | set(
1575
+ self._categoricals.keys() - {self._sample_metadata_key}
1393
1576
  )
1394
- self._valid_var_keys: list[str] | None = None
1395
- self._var_fields_flat: dict[str, FieldAttr] | None = None
1396
- self._check_save_keys()
1397
-
1398
- # check that the provided keys in var_index and categoricals are available in the store
1399
- # and save features
1400
- def _check_save_keys(self):
1401
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1402
-
1403
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1404
- experiment_obs = experiment.obs
1405
- self._n_obs = len(experiment_obs)
1406
- self._obs_pa_schema = experiment_obs.schema
1407
- valid_obs_keys = [
1408
- k for k in self._obs_pa_schema.names if k != "soma_joinid"
1409
- ]
1410
- self._valid_obs_keys = valid_obs_keys
1411
-
1412
- valid_var_keys = []
1413
- ms_list = []
1414
- for ms in experiment.ms.keys():
1415
- ms_list.append(ms)
1416
- var_ms = experiment.ms[ms].var
1417
- valid_var_keys += [
1418
- f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1419
- ]
1420
- self._valid_var_keys = valid_var_keys
1577
+ self._verbosity = verbosity
1578
+ self._sample_df_curator = None
1579
+ if self._sample_metadata_key is not None:
1580
+ self._sample_metadata = self._sdata.get_attrs(
1581
+ key=self._sample_metadata_key, return_as="df", flatten=True
1582
+ )
1583
+ self._is_validated = False
1421
1584
 
1422
- # check validity of keys in categoricals
1585
+ # Check validity of keys in categoricals
1423
1586
  nonval_keys = []
1424
- for obs_key in self._obs_fields.keys():
1425
- if obs_key not in valid_obs_keys:
1426
- nonval_keys.append(obs_key)
1587
+ for accessor, accessor_categoricals in self._categoricals.items():
1588
+ if (
1589
+ accessor == self._sample_metadata_key
1590
+ and self._sample_metadata is not None
1591
+ ):
1592
+ for key in accessor_categoricals.keys():
1593
+ if key not in self._sample_metadata.columns:
1594
+ nonval_keys.append(key)
1595
+ else:
1596
+ for key in accessor_categoricals.keys():
1597
+ if key not in self._sdata[accessor].obs.columns:
1598
+ nonval_keys.append(key)
1599
+
1427
1600
  _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1428
1601
 
1429
- # check validity of keys in var_index
1430
- self._var_fields_flat = {}
1602
+ # check validity of keys in sources
1431
1603
  nonval_keys = []
1432
- for ms_key in self._var_fields.keys():
1433
- var_key, var_field = self._var_fields[ms_key]
1434
- var_key_flat = f"{ms_key}__{var_key}"
1435
- if var_key_flat not in valid_var_keys:
1436
- nonval_keys.append(f"({ms_key}, {var_key})")
1604
+ for accessor, accessor_sources in self._sources.items():
1605
+ if (
1606
+ accessor == self._sample_metadata_key
1607
+ and self._sample_metadata is not None
1608
+ ):
1609
+ columns = self._sample_metadata.columns
1610
+ elif accessor != self._sample_metadata_key:
1611
+ columns = self._sdata[accessor].obs.columns
1437
1612
  else:
1438
- self._var_fields_flat[var_key_flat] = var_field
1439
- _maybe_curation_keys_not_present(nonval_keys, "var_index")
1440
-
1441
- # check validity of keys in sources and exclude
1442
- valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1443
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1444
- nonval_keys = []
1445
- for arg_key in dct.keys():
1446
- if arg_key not in valid_arg_keys:
1447
- nonval_keys.append(arg_key)
1448
- _maybe_curation_keys_not_present(nonval_keys, name)
1613
+ continue
1614
+ for key in accessor_sources:
1615
+ if key not in columns:
1616
+ nonval_keys.append(key)
1617
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1449
1618
 
1450
- # register obs columns' names
1451
- register_columns = list(self._obs_fields.keys())
1452
- organism = check_registry_organism(
1453
- self._columns_field.field.model, self._organism
1454
- ).get("organism")
1455
- update_registry(
1456
- values=register_columns,
1457
- field=self._columns_field,
1458
- key="columns",
1459
- validated_only=False,
1460
- organism=organism,
1461
- source=self._sources.get("columns"),
1462
- exclude=self._exclude.get("columns"),
1463
- )
1464
- additional_columns = [k for k in valid_obs_keys if k not in register_columns]
1465
- # no need to register with validated_only=True if columns are features
1619
+ # Set up sample level metadata and table Curator objects
1466
1620
  if (
1467
- len(additional_columns) > 0
1468
- and self._columns_field.field.model is not Feature
1621
+ self._sample_metadata_key is not None
1622
+ and self._sample_metadata_key in self._categoricals
1469
1623
  ):
1470
- update_registry(
1471
- values=additional_columns,
1472
- field=self._columns_field,
1473
- key="columns",
1474
- validated_only=True,
1624
+ self._sample_df_curator = DataFrameCatManager(
1625
+ df=self._sample_metadata,
1626
+ columns=Feature.name,
1627
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1628
+ verbosity=verbosity,
1629
+ sources=self._sources.get(self._sample_metadata_key),
1630
+ organism=organism,
1631
+ )
1632
+ self._table_adata_curators = {
1633
+ table: AnnDataCatManager(
1634
+ data=self._sdata[table],
1635
+ var_index=var_index.get(table),
1636
+ categoricals=self._categoricals.get(table),
1637
+ verbosity=verbosity,
1638
+ sources=self._sources.get(table),
1475
1639
  organism=organism,
1476
- source=self._sources.get("columns"),
1477
- exclude=self._exclude.get("columns"),
1478
1640
  )
1641
+ for table in self._table_keys
1642
+ }
1479
1643
 
1480
- def validate(self):
1481
- """Validate categories."""
1482
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1644
+ self._non_validated = None
1483
1645
 
1484
- validated = True
1485
- self._non_validated_values = {}
1486
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1487
- for ms, (key, field) in self._var_fields.items():
1488
- var_ms = experiment.ms[ms].var
1489
- var_ms_key = f"{ms}__{key}"
1490
- # it was already validated and cached
1491
- if var_ms_key in self._validated_values:
1492
- continue
1493
- var_ms_values = (
1494
- var_ms.read(column_names=[key]).concat()[key].to_pylist()
1495
- )
1496
- organism = check_registry_organism(
1497
- field.field.model, self._organism
1498
- ).get("organism")
1499
- update_registry(
1500
- values=var_ms_values,
1501
- field=field,
1502
- key=var_ms_key,
1503
- validated_only=True,
1504
- organism=organism,
1505
- source=self._sources.get(var_ms_key),
1506
- exclude=self._exclude.get(var_ms_key),
1507
- )
1508
- _, non_val = validate_categories(
1509
- values=var_ms_values,
1510
- field=field,
1511
- key=var_ms_key,
1512
- organism=organism,
1513
- source=self._sources.get(var_ms_key),
1514
- exclude=self._exclude.get(var_ms_key),
1515
- )
1516
- if len(non_val) > 0:
1517
- validated = False
1518
- self._non_validated_values[var_ms_key] = non_val
1519
- else:
1520
- self._validated_values[var_ms_key] = var_ms_values
1646
+ @property
1647
+ def var_index(self) -> FieldAttr:
1648
+ """Return the registry fields to validate the variable indices against."""
1649
+ return self._var_fields
1521
1650
 
1522
- obs = experiment.obs
1523
- for key, field in self._obs_fields.items():
1524
- # already validated and cached
1525
- if key in self._validated_values:
1526
- continue
1527
- values = pa.compute.unique(
1528
- obs.read(column_names=[key]).concat()[key]
1529
- ).to_pylist()
1530
- organism = check_registry_organism(
1531
- field.field.model, self._organism
1532
- ).get("organism")
1533
- update_registry(
1534
- values=values,
1535
- field=field,
1536
- key=key,
1537
- validated_only=True,
1538
- organism=organism,
1539
- source=self._sources.get(key),
1540
- exclude=self._exclude.get(key),
1541
- )
1542
- _, non_val = validate_categories(
1543
- values=values,
1544
- field=field,
1545
- key=key,
1546
- organism=organism,
1547
- source=self._sources.get(key),
1548
- exclude=self._exclude.get(key),
1549
- )
1550
- if len(non_val) > 0:
1551
- validated = False
1552
- self._non_validated_values[key] = non_val
1553
- else:
1554
- self._validated_values[key] = values
1555
- self._is_validated = validated
1556
- return self._is_validated
1651
+ @property
1652
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1653
+ """Return the categorical keys and fields to validate against."""
1654
+ return self._categoricals
1557
1655
 
1558
- def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1559
- assert self._non_validated_values is not None # noqa: S101
1656
+ @property
1657
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1658
+ """Return the non-validated features and labels."""
1659
+ if self._non_validated is None:
1660
+ raise ValidationError("Please run validate() first!")
1661
+ return self._non_validated
1560
1662
 
1561
- if key in self._valid_obs_keys:
1562
- field = self._obs_fields[key]
1563
- elif key in self._valid_var_keys:
1564
- ms = key.partition("__")[0]
1565
- field = self._var_fields[ms][1]
1566
- else:
1567
- raise KeyError(f"key {key} is invalid!")
1568
- values = self._non_validated_values.get(key, [])
1569
- return values, field
1663
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1664
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1665
+ for acc in accessors:
1666
+ is_present = False
1667
+ try:
1668
+ self._sdata.get_attrs(key=acc)
1669
+ is_present = True
1670
+ except KeyError:
1671
+ if acc in self._sdata.tables.keys():
1672
+ is_present = True
1673
+ if not is_present:
1674
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1570
1675
 
1571
- def add_new_from(self, key: str, **kwargs) -> None:
1572
- """Add validated & new categories.
1676
+ def lookup(self, public: bool = False) -> CatLookup:
1677
+ """Look up categories.
1573
1678
 
1574
1679
  Args:
1575
- key: The key referencing the slot in the `tiledbsoma` store.
1576
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1577
- or a column name in `.obs`.
1680
+ public: Whether the lookup is performed on the public reference.
1578
1681
  """
1579
- if self._non_validated_values is None:
1580
- raise ValidationError("Run .validate() first.")
1581
- if key == "all":
1582
- keys = list(self._non_validated_values.keys())
1583
- else:
1584
- avail_keys = list(
1585
- chain(self._non_validated_values.keys(), self._validated_values.keys())
1682
+ cat_values_dict = list(self.categoricals.values())[0]
1683
+ return CatLookup(
1684
+ categoricals=cat_values_dict,
1685
+ slots={"accessors": cat_values_dict.keys()},
1686
+ public=public,
1687
+ )
1688
+
1689
+ def _update_registry_all(self) -> None:
1690
+ """Save labels of all features for sample and table metadata."""
1691
+ if self._sample_df_curator is not None:
1692
+ self._sample_df_curator._update_registry_all(
1693
+ validated_only=True,
1586
1694
  )
1587
- if key not in avail_keys:
1588
- raise KeyError(
1589
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1590
- )
1591
- keys = [key]
1592
- for k in keys:
1593
- values, field = self._non_validated_values_field(k)
1594
- if len(values) == 0:
1595
- continue
1596
- organism = check_registry_organism(field.field.model, self._organism).get(
1597
- "organism"
1598
- )
1599
- update_registry(
1600
- values=values,
1601
- field=field,
1602
- key=k,
1603
- validated_only=False,
1604
- organism=organism,
1605
- source=self._sources.get(k),
1606
- exclude=self._exclude.get(k),
1607
- **kwargs,
1695
+ for _, adata_curator in self._table_adata_curators.items():
1696
+ adata_curator._obs_df_curator._update_registry_all(
1697
+ validated_only=True,
1608
1698
  )
1609
- # update non-validated values list but keep the key there
1610
- # it will be removed by .validate()
1611
- if k in self._non_validated_values:
1612
- self._non_validated_values[k] = []
1613
1699
 
1614
- @property
1615
- def non_validated(self) -> dict[str, list]:
1616
- """Return the non-validated features and labels."""
1617
- non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
1618
- return non_val
1700
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1701
+ """Save new values from ``.var.index`` of table.
1619
1702
 
1620
- @property
1621
- def var_index(self) -> dict[str, FieldAttr]:
1622
- """Return the registry fields with flattened keys to validate variables indices against."""
1623
- return self._var_fields_flat
1703
+ Args:
1704
+ table: The table key.
1705
+ organism: Not a parameter of this method; the table curators are built with the organism passed to the constructor.
1706
+ **kwargs: Additional keyword arguments to pass to create new records.
1707
+ """
1708
+ if self._non_validated is None:
1709
+ raise ValidationError("Run .validate() first.")
1710
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1711
+ if table in self.non_validated.keys():
1712
+ if "var_index" in self._non_validated[table]:
1713
+ self._non_validated[table].pop("var_index")
1624
1714
 
1625
- @property
1626
- def categoricals(self) -> dict[str, FieldAttr]:
1627
- """Return the obs fields to validate against."""
1628
- return self._obs_fields
1715
+ if len(self.non_validated[table].values()) == 0:
1716
+ self.non_validated.pop(table)
1629
1717
 
1630
- def lookup(self, public: bool = False) -> CurateLookup:
1631
- """Lookup categories.
1718
+ def add_new_from(
1719
+ self,
1720
+ key: str,
1721
+ accessor: str | None = None,
1722
+ **kwargs,
1723
+ ) -> None:
1724
+ """Save new values of a categorical from the sample-level metadata or a table.
1632
1725
 
1633
1726
  Args:
1634
- public: If "public", the lookup is performed on the public reference.
1727
+ key: The key referencing the slot in the DataFrame.
1728
+ accessor: The accessor key such as 'sample' or 'table x'.
1729
+ organism: Not a parameter of this method; the sample and table curators are built with the organism passed to the constructor.
1730
+ **kwargs: Additional keyword arguments to pass to create new records.
1635
1731
  """
1636
- return CurateLookup(
1637
- categoricals=self._obs_fields,
1638
- slots={"columns": self._columns_field, **self._var_fields_flat},
1639
- public=public,
1640
- )
1732
+ if self._non_validated is None:
1733
+ raise ValidationError("Run .validate() first.")
1641
1734
 
1642
- def standardize(self, key: str):
1643
- """Replace synonyms with standardized values.
1735
+ if len(kwargs) > 0 and key == "all":
1736
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
1737
+
1738
+ if accessor not in self.categoricals:
1739
+ raise ValueError(
1740
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
1741
+ )
1742
+
1743
+ if accessor in self._table_adata_curators:
1744
+ adata_curator = self._table_adata_curators[accessor]
1745
+ adata_curator.add_new_from(key=key, **kwargs)
1746
+ if accessor == self._sample_metadata_key:
1747
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
1748
+
1749
+ if accessor in self.non_validated.keys():
1750
+ if len(self.non_validated[accessor].values()) == 0:
1751
+ self.non_validated.pop(accessor)
1752
+
1753
+ def standardize(self, key: str, accessor: str | None = None) -> None:
1754
+ """Replace synonyms with canonical values.
1644
1755
 
1645
1756
  Modifies the dataset inplace.
1646
1757
 
1647
1758
  Args:
1648
- key: The key referencing the slot in the `tiledbsoma` store.
1649
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1650
- or a column name in `.obs`.
1759
+ key: The key referencing the slot in the table or sample metadata.
1760
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
1651
1761
  """
1652
1762
  if len(self.non_validated) == 0:
1653
1763
  logger.warning("values are already standardized")
1654
1764
  return
1655
- avail_keys = list(self._non_validated_values.keys())
1656
- if key == "all":
1657
- keys = avail_keys
1765
+ if self._artifact is not None:
1766
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1767
+
1768
+ if accessor == self._sample_metadata_key:
1769
+ if key not in self._sample_metadata.columns:
1770
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1658
1771
  else:
1659
- if key not in avail_keys:
1660
- raise KeyError(
1661
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1662
- )
1663
- keys = [key]
1772
+ if (
1773
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
1774
+ ) or (
1775
+ key != "var_index"
1776
+ and key not in self._sdata.tables[accessor].obs.columns
1777
+ ):
1778
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1664
1779
 
1665
- for k in keys:
1666
- values, field = self._non_validated_values_field(k)
1667
- if len(values) == 0:
1668
- continue
1669
- if k in self._valid_var_keys:
1670
- ms, _, slot_key = k.partition("__")
1671
- slot = lambda experiment: experiment.ms[ms].var # noqa: B023
1672
- else:
1673
- slot = lambda experiment: experiment.obs
1674
- slot_key = k
1675
- # errors if public ontology and the model has no organism
1676
- # has to be fixed in bionty
1677
- organism = check_registry_organism(field.field.model, self._organism).get(
1678
- "organism"
1679
- )
1680
- syn_mapper = standardize_categories(
1681
- values=values,
1682
- field=field,
1683
- source=self._sources.get(k),
1684
- organism=organism,
1685
- )
1686
- if (n_syn_mapper := len(syn_mapper)) == 0:
1687
- continue
1780
+ if accessor in self._table_adata_curators.keys():
1781
+ adata_curator = self._table_adata_curators[accessor]
1782
+ adata_curator.standardize(key)
1783
+ if accessor == self._sample_metadata_key:
1784
+ self._sample_df_curator.standardize(key)
1688
1785
 
1689
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1786
+ if len(self.non_validated[accessor].values()) == 0:
1787
+ self.non_validated.pop(accessor)
1690
1788
 
1691
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1692
- value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1693
- table = slot(experiment).read(value_filter=value_filter).concat()
1789
+ def validate(self) -> bool:
1790
+ """Validate variables and categorical observations.
1694
1791
 
1695
- if len(table) == 0:
1696
- continue
1792
+ This method also registers the validated records in the current instance:
1793
+ - from public sources
1697
1794
 
1698
- df = table.to_pandas()
1699
- # map values
1700
- df[slot_key] = df[slot_key].map(
1701
- lambda val: syn_mapper.get(val, val) # noqa
1702
- )
1703
- # write the mapped values
1704
- with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1705
- slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1706
- # update non_validated dict
1707
- non_val_k = [
1708
- nv for nv in self._non_validated_values[k] if nv not in syn_mapper
1709
- ]
1710
- self._non_validated_values[k] = non_val_k
1795
+ Note:
1796
+ Takes no arguments; the sample and table curators are built with the organism passed to the constructor.
1711
1797
 
1712
- syn_mapper_print = _format_values(
1713
- [f'"{m_k}" "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
1714
- )
1715
- s = "s" if n_syn_mapper > 1 else ""
1716
- logger.success(
1717
- f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
1718
- )
1798
+ Returns:
1799
+ Whether the SpatialData object is validated.
1800
+ """
1801
+ # add all validated records to the current instance
1802
+ verbosity = settings.verbosity
1803
+ try:
1804
+ settings.verbosity = "error"
1805
+ self._update_registry_all()
1806
+ finally:
1807
+ settings.verbosity = verbosity
1808
+
1809
+ self._non_validated = {} # type: ignore
1810
+
1811
+ sample_validated = True
1812
+ if self._sample_df_curator:
1813
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
1814
+ sample_validated &= self._sample_df_curator.validate()
1815
+ if len(self._sample_df_curator.non_validated) > 0:
1816
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
1817
+ logger.print("")
1818
+
1819
+ mods_validated = True
1820
+ for table, adata_curator in self._table_adata_curators.items():
1821
+ logger.info(f"validating categoricals of table '{table}' ...")
1822
+ mods_validated &= adata_curator.validate()
1823
+ if len(adata_curator.non_validated) > 0:
1824
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
1825
+ logger.print("")
1826
+
1827
+ self._is_validated = sample_validated & mods_validated
1828
+ return self._is_validated
1719
1829
 
1720
1830
  def save_artifact(
1721
1831
  self,
@@ -1725,424 +1835,388 @@ class TiledbsomaCatManager(CatManager):
1725
1835
  revises: Artifact | None = None,
1726
1836
  run: Run | None = None,
1727
1837
  ) -> Artifact:
1728
- """Save the validated `tiledbsoma` store and metadata.
1838
+ """Save the validated SpatialData store and metadata.
1729
1839
 
1730
1840
  Args:
1731
- description: A description of the ``tiledbsoma`` store.
1841
+ description: A description of the dataset.
1732
1842
  key: A path-like key to reference artifact in default storage,
1733
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1843
+ e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
1734
1844
  revises: Previous version of the artifact. Triggers a revision.
1735
1845
  run: The run that creates the artifact.
1736
1846
 
1737
1847
  Returns:
1738
1848
  A saved artifact record.
1739
1849
  """
1740
- from lamindb.models.artifact import add_labels
1741
-
1742
1850
  if not self._is_validated:
1743
1851
  self.validate()
1744
1852
  if not self._is_validated:
1745
1853
  raise ValidationError("Dataset does not validate. Please curate.")
1746
1854
 
1747
- if self._artifact is None:
1748
- artifact = Artifact(
1749
- self._dataset,
1750
- description=description,
1751
- key=key,
1752
- revises=revises,
1753
- run=run,
1754
- )
1755
- artifact.n_observations = self._n_obs
1756
- artifact.otype = "tiledbsoma"
1757
- artifact.save()
1855
+ return save_artifact(
1856
+ self._sdata,
1857
+ description=description,
1858
+ fields=self.categoricals,
1859
+ index_field=self.var_index,
1860
+ key=key,
1861
+ artifact=self._artifact,
1862
+ revises=revises,
1863
+ run=run,
1864
+ schema=None,
1865
+ organism=self._organism,
1866
+ sample_metadata_key=self._sample_metadata_key,
1867
+ )
1868
+
1869
+
1870
+ class TiledbsomaCatManager(CatManager):
1871
+ """Categorical manager for `tiledbsoma.Experiment`."""
1872
+
1873
+ def __init__(
1874
+ self,
1875
+ experiment_uri: UPathStr | Artifact,
1876
+ var_index: dict[str, tuple[str, FieldAttr]],
1877
+ categoricals: dict[str, FieldAttr] | None = None,
1878
+ obs_columns: FieldAttr = Feature.name,
1879
+ organism: str | None = None,
1880
+ sources: dict[str, Record] | None = None,
1881
+ ):
1882
+ self._obs_fields = categoricals or {}
1883
+ self._var_fields = var_index
1884
+ self._columns_field = obs_columns
1885
+ if isinstance(experiment_uri, Artifact):
1886
+ self._dataset = experiment_uri.path
1887
+ self._artifact = experiment_uri
1758
1888
  else:
1759
- artifact = self._artifact
1889
+ self._dataset = UPath(experiment_uri)
1890
+ self._artifact = None
1891
+ self._organism = organism
1892
+ self._sources = sources or {}
1760
1893
 
1761
- feature_sets = {}
1762
- if len(self._obs_fields) > 0:
1763
- organism = check_registry_organism(
1764
- self._columns_field.field.model, self._organism
1765
- ).get("organism")
1766
- empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
1767
- mock_df = pa.Table.from_pydict(
1768
- empty_dict, schema=self._obs_pa_schema
1769
- ).to_pandas()
1770
- # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1771
- feature_sets["obs"] = Schema.from_df(
1772
- df=mock_df,
1773
- field=self._columns_field,
1774
- mute=True,
1775
- organism=organism,
1776
- )
1777
- for ms in self._var_fields:
1778
- var_key, var_field = self._var_fields[ms]
1779
- organism = check_registry_organism(
1780
- var_field.field.model, self._organism
1781
- ).get("organism")
1782
- feature_sets[f"{ms}__var"] = Schema.from_values(
1783
- values=self._validated_values[f"{ms}__{var_key}"],
1784
- field=var_field,
1785
- organism=organism,
1786
- raise_validation_error=False,
1787
- )
1788
- artifact._staged_feature_sets = feature_sets
1789
-
1790
- feature_ref_is_name = _ref_is_name(self._columns_field)
1791
- features = Feature.lookup().dict()
1792
- for key, field in self._obs_fields.items():
1793
- feature = features.get(key)
1794
- registry = field.field.model
1795
- organism = check_registry_organism(field.field.model, self._organism).get(
1796
- "organism"
1797
- )
1798
- labels = registry.from_values(
1799
- values=self._validated_values[key], field=field, organism=organism
1800
- )
1801
- if len(labels) == 0:
1802
- continue
1803
- if hasattr(registry, "_name_field"):
1804
- label_ref_is_name = field.field.name == registry._name_field
1805
- add_labels(
1806
- artifact,
1807
- records=labels,
1808
- feature=feature,
1809
- feature_ref_is_name=feature_ref_is_name,
1810
- label_ref_is_name=label_ref_is_name,
1811
- from_curator=True,
1812
- )
1813
-
1814
- return artifact.save()
1815
-
1816
-
1817
- class SpatialDataCatManager(CatManager):
1818
- """Curation flow for a ``Spatialdata`` object.
1819
-
1820
- See also :class:`~lamindb.Curator`.
1821
-
1822
- Note that if genes or other measurements are removed from the SpatialData object,
1823
- the object should be recreated.
1894
+ self._is_validated: bool | None = False
1895
+ self._non_validated_values: dict[str, list] | None = None
1896
+ self._validated_values: dict[str, list] = {}
1897
+ # filled by _check_save_keys
1898
+ self._n_obs: int | None = None
1899
+ self._valid_obs_keys: list[str] | None = None
1900
+ self._obs_pa_schema: pa.lib.Schema | None = (
1901
+ None # this is needed to create the obs feature set
1902
+ )
1903
+ self._valid_var_keys: list[str] | None = None
1904
+ self._var_fields_flat: dict[str, FieldAttr] | None = None
1905
+ self._check_save_keys()
1824
1906
 
1825
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1907
+ # check that the provided keys in var_index and categoricals are available in the store
1908
+ # and save features
1909
+ def _check_save_keys(self):
1910
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1826
1911
 
1827
- Args:
1828
- sdata: The SpatialData object to curate.
1829
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1830
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1912
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1913
+ experiment_obs = experiment.obs
1914
+ self._n_obs = len(experiment_obs)
1915
+ self._obs_pa_schema = experiment_obs.schema
1916
+ valid_obs_keys = [
1917
+ k for k in self._obs_pa_schema.names if k != "soma_joinid"
1918
+ ]
1919
+ self._valid_obs_keys = valid_obs_keys
1831
1920
 
1832
- organism: The organism name.
1833
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1834
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1835
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1836
- using the exclude parameter ensures they are not validated.
1837
- verbosity: The verbosity level of the logger.
1838
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1839
-
1840
- Examples:
1841
- >>> import bionty as bt
1842
- >>> curator = SpatialDataCatManager(
1843
- ... sdata,
1844
- ... var_index={
1845
- ... "table_1": bt.Gene.ensembl_gene_id,
1846
- ... },
1847
- ... categoricals={
1848
- ... "table1":
1849
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1850
- ... "sample":
1851
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1852
- ... },
1853
- ... organism="human",
1854
- ... )
1855
- """
1921
+ valid_var_keys = []
1922
+ ms_list = []
1923
+ for ms in experiment.ms.keys():
1924
+ ms_list.append(ms)
1925
+ var_ms = experiment.ms[ms].var
1926
+ valid_var_keys += [
1927
+ f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1928
+ ]
1929
+ self._valid_var_keys = valid_var_keys
1856
1930
 
1857
- def __init__(
1858
- self,
1859
- sdata: Any,
1860
- var_index: dict[str, FieldAttr],
1861
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1862
- verbosity: str = "hint",
1863
- organism: str | None = None,
1864
- sources: dict[str, dict[str, Record]] | None = None,
1865
- exclude: dict[str, dict] | None = None,
1866
- *,
1867
- sample_metadata_key: str | None = "sample",
1868
- ) -> None:
1869
- super().__init__(
1870
- dataset=sdata,
1871
- categoricals={},
1872
- sources=sources,
1873
- organism=organism,
1874
- exclude=exclude,
1875
- )
1876
- if isinstance(sdata, Artifact):
1877
- # TODO: load() doesn't yet work
1878
- self._sdata = sdata.load()
1879
- else:
1880
- self._sdata = self._dataset
1881
- self._sample_metadata_key = sample_metadata_key
1882
- self._write_path = None
1883
- self._var_fields = var_index
1884
- self._verify_accessor_exists(self._var_fields.keys())
1885
- self._categoricals = categoricals
1886
- self._table_keys = set(self._var_fields.keys()) | set(
1887
- self._categoricals.keys() - {self._sample_metadata_key}
1888
- )
1889
- self._verbosity = verbosity
1890
- self._sample_df_curator = None
1891
- if self._sample_metadata_key is not None:
1892
- self._sample_metadata = self._sdata.get_attrs(
1893
- key=self._sample_metadata_key, return_as="df", flatten=True
1894
- )
1895
- self._is_validated = False
1931
+ # check validity of keys in categoricals
1932
+ nonval_keys = []
1933
+ for obs_key in self._obs_fields.keys():
1934
+ if obs_key not in valid_obs_keys:
1935
+ nonval_keys.append(obs_key)
1936
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1896
1937
 
1897
- # Check validity of keys in categoricals
1938
+ # check validity of keys in var_index
1939
+ self._var_fields_flat = {}
1898
1940
  nonval_keys = []
1899
- for accessor, accessor_categoricals in self._categoricals.items():
1900
- if (
1901
- accessor == self._sample_metadata_key
1902
- and self._sample_metadata is not None
1903
- ):
1904
- for key in accessor_categoricals.keys():
1905
- if key not in self._sample_metadata.columns:
1906
- nonval_keys.append(key)
1941
+ for ms_key in self._var_fields.keys():
1942
+ var_key, var_field = self._var_fields[ms_key]
1943
+ var_key_flat = f"{ms_key}__{var_key}"
1944
+ if var_key_flat not in valid_var_keys:
1945
+ nonval_keys.append(f"({ms_key}, {var_key})")
1907
1946
  else:
1908
- for key in accessor_categoricals.keys():
1909
- if key not in self._sdata[accessor].obs.columns:
1910
- nonval_keys.append(key)
1911
-
1912
- _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1947
+ self._var_fields_flat[var_key_flat] = var_field
1948
+ _maybe_curation_keys_not_present(nonval_keys, "var_index")
1913
1949
 
1914
- # check validity of keys in sources and exclude
1915
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1916
- nonval_keys = []
1917
- for accessor, accessor_sources in dct.items():
1918
- if (
1919
- accessor == self._sample_metadata_key
1920
- and self._sample_metadata is not None
1921
- ):
1922
- columns = self._sample_metadata.columns
1923
- elif accessor != self._sample_metadata_key:
1924
- columns = self._sdata[accessor].obs.columns
1925
- else:
1926
- continue
1927
- for key in accessor_sources:
1928
- if key not in columns:
1929
- nonval_keys.append(key)
1930
- _maybe_curation_keys_not_present(nonval_keys, name)
1950
+ # check validity of keys in sources
1951
+ valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1952
+ nonval_keys = []
1953
+ for arg_key in self._sources.keys():
1954
+ if arg_key not in valid_arg_keys:
1955
+ nonval_keys.append(arg_key)
1956
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1931
1957
 
1932
- # Set up sample level metadata and table Curator objects
1958
+ # register obs columns' names
1959
+ register_columns = list(self._obs_fields.keys())
1960
+ organism = configure_organism(
1961
+ self._columns_field.field.model, self._organism
1962
+ ).get("organism")
1963
+ update_registry(
1964
+ values=register_columns,
1965
+ field=self._columns_field,
1966
+ key="columns",
1967
+ validated_only=False,
1968
+ organism=organism,
1969
+ source=self._sources.get("columns"),
1970
+ )
1971
+ additional_columns = [k for k in valid_obs_keys if k not in register_columns]
1972
+ # no need to register with validated_only=True if columns are features
1933
1973
  if (
1934
- self._sample_metadata_key is not None
1935
- and self._sample_metadata_key in self._categoricals
1974
+ len(additional_columns) > 0
1975
+ and self._columns_field.field.model is not Feature
1936
1976
  ):
1937
- self._sample_df_curator = DataFrameCatManager(
1938
- df=self._sample_metadata,
1939
- columns=Feature.name,
1940
- categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1941
- verbosity=verbosity,
1942
- sources=self._sources.get(self._sample_metadata_key),
1943
- exclude=self._exclude.get(self._sample_metadata_key),
1944
- organism=organism,
1945
- )
1946
- self._table_adata_curators = {
1947
- table: AnnDataCatManager(
1948
- data=self._sdata[table],
1949
- var_index=var_index.get(table),
1950
- categoricals=self._categoricals.get(table),
1951
- verbosity=verbosity,
1952
- sources=self._sources.get(table),
1953
- exclude=self._exclude.get(table),
1977
+ update_registry(
1978
+ values=additional_columns,
1979
+ field=self._columns_field,
1980
+ key="columns",
1981
+ validated_only=True,
1954
1982
  organism=organism,
1983
+ source=self._sources.get("columns"),
1955
1984
  )
1956
- for table in self._table_keys
1957
- }
1958
1985
 
1959
- self._non_validated = None
1986
+ def validate(self):
1987
+ """Validate categories."""
1988
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1960
1989
 
1961
- @property
1962
- def var_index(self) -> FieldAttr:
1963
- """Return the registry fields to validate variables indices against."""
1964
- return self._var_fields
1990
+ validated = True
1991
+ self._non_validated_values = {}
1992
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1993
+ for ms, (key, field) in self._var_fields.items():
1994
+ var_ms = experiment.ms[ms].var
1995
+ var_ms_key = f"{ms}__{key}"
1996
+ # it was already validated and cached
1997
+ if var_ms_key in self._validated_values:
1998
+ continue
1999
+ var_ms_values = (
2000
+ var_ms.read(column_names=[key]).concat()[key].to_pylist()
2001
+ )
2002
+ organism = configure_organism(field.field.model, self._organism).get(
2003
+ "organism"
2004
+ )
2005
+ update_registry(
2006
+ values=var_ms_values,
2007
+ field=field,
2008
+ key=var_ms_key,
2009
+ validated_only=True,
2010
+ organism=organism,
2011
+ source=self._sources.get(var_ms_key),
2012
+ )
2013
+ _, non_val = validate_categories(
2014
+ values=var_ms_values,
2015
+ field=field,
2016
+ key=var_ms_key,
2017
+ organism=organism,
2018
+ source=self._sources.get(var_ms_key),
2019
+ )
2020
+ if len(non_val) > 0:
2021
+ validated = False
2022
+ self._non_validated_values[var_ms_key] = non_val
2023
+ else:
2024
+ self._validated_values[var_ms_key] = var_ms_values
1965
2025
 
1966
- @property
1967
- def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1968
- """Return the categorical keys and fields to validate against."""
1969
- return self._categoricals
2026
+ obs = experiment.obs
2027
+ for key, field in self._obs_fields.items():
2028
+ # already validated and cached
2029
+ if key in self._validated_values:
2030
+ continue
2031
+ values = pa.compute.unique(
2032
+ obs.read(column_names=[key]).concat()[key]
2033
+ ).to_pylist()
2034
+ organism = configure_organism(field.field.model, self._organism).get(
2035
+ "organism"
2036
+ )
2037
+ update_registry(
2038
+ values=values,
2039
+ field=field,
2040
+ key=key,
2041
+ validated_only=True,
2042
+ organism=organism,
2043
+ source=self._sources.get(key),
2044
+ )
2045
+ _, non_val = validate_categories(
2046
+ values=values,
2047
+ field=field,
2048
+ key=key,
2049
+ organism=organism,
2050
+ source=self._sources.get(key),
2051
+ )
2052
+ if len(non_val) > 0:
2053
+ validated = False
2054
+ self._non_validated_values[key] = non_val
2055
+ else:
2056
+ self._validated_values[key] = values
2057
+ self._is_validated = validated
2058
+ return self._is_validated
1970
2059
 
1971
- @property
1972
- def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1973
- """Return the non-validated features and labels."""
1974
- if self._non_validated is None:
1975
- raise ValidationError("Please run validate() first!")
1976
- return self._non_validated
2060
+ def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
2061
+ assert self._non_validated_values is not None # noqa: S101
1977
2062
 
1978
- def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1979
- """Verify that the accessors exist (either a valid table or in attrs)."""
1980
- for acc in accessors:
1981
- is_present = False
1982
- try:
1983
- self._sdata.get_attrs(key=acc)
1984
- is_present = True
1985
- except KeyError:
1986
- if acc in self._sdata.tables.keys():
1987
- is_present = True
1988
- if not is_present:
1989
- raise ValidationError(f"Accessor '{acc}' does not exist!")
2063
+ if key in self._valid_obs_keys:
2064
+ field = self._obs_fields[key]
2065
+ elif key in self._valid_var_keys:
2066
+ ms = key.partition("__")[0]
2067
+ field = self._var_fields[ms][1]
2068
+ else:
2069
+ raise KeyError(f"key {key} is invalid!")
2070
+ values = self._non_validated_values.get(key, [])
2071
+ return values, field
1990
2072
 
1991
- def lookup(self, public: bool = False) -> CurateLookup:
1992
- """Look up categories.
2073
+ def add_new_from(self, key: str, **kwargs) -> None:
2074
+ """Add validated & new categories.
1993
2075
 
1994
2076
  Args:
1995
- public: Whether the lookup is performed on the public reference.
2077
+ key: The key referencing the slot in the `tiledbsoma` store.
2078
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2079
+ or a column name in `.obs`.
1996
2080
  """
1997
- cat_values_dict = list(self.categoricals.values())[0]
1998
- return CurateLookup(
1999
- categoricals=cat_values_dict,
2000
- slots={"accessors": cat_values_dict.keys()},
2001
- public=public,
2002
- )
2003
-
2004
- def _update_registry_all(self) -> None:
2005
- """Saves labels of all features for sample and table metadata."""
2006
- if self._sample_df_curator is not None:
2007
- self._sample_df_curator._update_registry_all(
2008
- validated_only=True,
2081
+ if self._non_validated_values is None:
2082
+ raise ValidationError("Run .validate() first.")
2083
+ if key == "all":
2084
+ keys = list(self._non_validated_values.keys())
2085
+ else:
2086
+ avail_keys = list(
2087
+ chain(self._non_validated_values.keys(), self._validated_values.keys())
2009
2088
  )
2010
- for _, adata_curator in self._table_adata_curators.items():
2011
- adata_curator._obs_df_curator._update_registry_all(
2012
- validated_only=True,
2089
+ if key not in avail_keys:
2090
+ raise KeyError(
2091
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2092
+ )
2093
+ keys = [key]
2094
+ for k in keys:
2095
+ values, field = self._non_validated_values_field(k)
2096
+ if len(values) == 0:
2097
+ continue
2098
+ organism = configure_organism(field.field.model, self._organism).get(
2099
+ "organism"
2100
+ )
2101
+ update_registry(
2102
+ values=values,
2103
+ field=field,
2104
+ key=k,
2105
+ validated_only=False,
2106
+ organism=organism,
2107
+ source=self._sources.get(k),
2108
+ **kwargs,
2013
2109
  )
2110
+ # update non-validated values list but keep the key there
2111
+ # it will be removed by .validate()
2112
+ if k in self._non_validated_values:
2113
+ self._non_validated_values[k] = []
2014
2114
 
2015
- def add_new_from_var_index(self, table: str, **kwargs) -> None:
2016
- """Save new values from ``.var.index`` of table.
2115
+ @property
2116
+ def non_validated(self) -> dict[str, list]:
2117
+ """Return the non-validated features and labels."""
2118
+ non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
2119
+ return non_val
2017
2120
 
2018
- Args:
2019
- table: The table key.
2020
- organism: The organism name.
2021
- **kwargs: Additional keyword arguments to pass to create new records.
2022
- """
2023
- if self._non_validated is None:
2024
- raise ValidationError("Run .validate() first.")
2025
- self._table_adata_curators[table].add_new_from_var_index(**kwargs)
2026
- if table in self.non_validated.keys():
2027
- if "var_index" in self._non_validated[table]:
2028
- self._non_validated[table].pop("var_index")
2121
+ @property
2122
+ def var_index(self) -> dict[str, FieldAttr]:
2123
+ """Return the registry fields with flattened keys to validate variable indices against."""
2124
+ return self._var_fields_flat
2029
2125
 
2030
- if len(self.non_validated[table].values()) == 0:
2031
- self.non_validated.pop(table)
2126
+ @property
2127
+ def categoricals(self) -> dict[str, FieldAttr]:
2128
+ """Return the obs fields to validate against."""
2129
+ return self._obs_fields
2032
2130
 
2033
- def add_new_from(
2034
- self,
2035
- key: str,
2036
- accessor: str | None = None,
2037
- **kwargs,
2038
- ) -> None:
2039
- """Save new values of categorical from sample level metadata or table.
2131
+ def lookup(self, public: bool = False) -> CatLookup:
2132
+ """Look up categories.
2040
2133
 
2041
2134
  Args:
2042
- key: The key referencing the slot in the DataFrame.
2043
- accessor: The accessor key such as 'sample' or 'table x'.
2044
- organism: The organism name.
2045
- **kwargs: Additional keyword arguments to pass to create new records.
2135
+ public: If `True`, the lookup is performed on the public reference.
2046
2136
  """
2047
- if self._non_validated is None:
2048
- raise ValidationError("Run .validate() first.")
2049
-
2050
- if len(kwargs) > 0 and key == "all":
2051
- raise ValueError("Cannot pass additional arguments to 'all' key!")
2052
-
2053
- if accessor not in self.categoricals:
2054
- raise ValueError(
2055
- f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2056
- )
2057
-
2058
- if accessor in self._table_adata_curators:
2059
- adata_curator = self._table_adata_curators[accessor]
2060
- adata_curator.add_new_from(key=key, **kwargs)
2061
- if accessor == self._sample_metadata_key:
2062
- self._sample_df_curator.add_new_from(key=key, **kwargs)
2063
-
2064
- if accessor in self.non_validated.keys():
2065
- if len(self.non_validated[accessor].values()) == 0:
2066
- self.non_validated.pop(accessor)
2137
+ return CatLookup(
2138
+ categoricals=self._obs_fields,
2139
+ slots={"columns": self._columns_field, **self._var_fields_flat},
2140
+ public=public,
2141
+ )
2067
2142
 
2068
- def standardize(self, key: str, accessor: str | None = None) -> None:
2069
- """Replace synonyms with canonical values.
2143
+ def standardize(self, key: str):
2144
+ """Replace synonyms with standardized values.
2070
2145
 
2071
2146
  Modifies the dataset inplace.
2072
2147
 
2073
2148
  Args:
2074
- key: The key referencing the slot in the table or sample metadata.
2075
- accessor: The accessor key such as 'sample_key' or 'table_key'.
2149
+ key: The key referencing the slot in the `tiledbsoma` store.
2150
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2151
+ or a column name in `.obs`.
2076
2152
  """
2077
2153
  if len(self.non_validated) == 0:
2078
2154
  logger.warning("values are already standardized")
2079
2155
  return
2080
- if self._artifact is not None:
2081
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2082
-
2083
- if accessor == self._sample_metadata_key:
2084
- if key not in self._sample_metadata.columns:
2085
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2156
+ avail_keys = list(self._non_validated_values.keys())
2157
+ if key == "all":
2158
+ keys = avail_keys
2086
2159
  else:
2087
- if (
2088
- key == "var_index" and self._sdata.tables[accessor].var.index is None
2089
- ) or (
2090
- key != "var_index"
2091
- and key not in self._sdata.tables[accessor].obs.columns
2092
- ):
2093
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2094
-
2095
- if accessor in self._table_adata_curators.keys():
2096
- adata_curator = self._table_adata_curators[accessor]
2097
- adata_curator.standardize(key)
2098
- if accessor == self._sample_metadata_key:
2099
- self._sample_df_curator.standardize(key)
2100
-
2101
- if len(self.non_validated[accessor].values()) == 0:
2102
- self.non_validated.pop(accessor)
2103
-
2104
- def validate(self) -> bool:
2105
- """Validate variables and categorical observations.
2106
-
2107
- This method also registers the validated records in the current instance:
2108
- - from public sources
2109
-
2110
- Args:
2111
- organism: The organism name.
2160
+ if key not in avail_keys:
2161
+ raise KeyError(
2162
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2163
+ )
2164
+ keys = [key]
2112
2165
 
2113
- Returns:
2114
- Whether the SpatialData object is validated.
2115
- """
2116
- from lamindb.core._settings import settings
2166
+ for k in keys:
2167
+ values, field = self._non_validated_values_field(k)
2168
+ if len(values) == 0:
2169
+ continue
2170
+ if k in self._valid_var_keys:
2171
+ ms, _, slot_key = k.partition("__")
2172
+ slot = lambda experiment: experiment.ms[ms].var # noqa: B023
2173
+ else:
2174
+ slot = lambda experiment: experiment.obs
2175
+ slot_key = k
2176
+ # errors if public ontology and the model has no organism
2177
+ # has to be fixed in bionty
2178
+ organism = configure_organism(field.field.model, self._organism).get(
2179
+ "organism"
2180
+ )
2181
+ syn_mapper = standardize_categories(
2182
+ values=values,
2183
+ field=field,
2184
+ source=self._sources.get(k),
2185
+ organism=organism,
2186
+ )
2187
+ if (n_syn_mapper := len(syn_mapper)) == 0:
2188
+ continue
2117
2189
 
2118
- # add all validated records to the current instance
2119
- verbosity = settings.verbosity
2120
- try:
2121
- settings.verbosity = "error"
2122
- self._update_registry_all()
2123
- finally:
2124
- settings.verbosity = verbosity
2190
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
2125
2191
 
2126
- self._non_validated = {} # type: ignore
2192
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
2193
+ value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
2194
+ table = slot(experiment).read(value_filter=value_filter).concat()
2127
2195
 
2128
- sample_validated = True
2129
- if self._sample_df_curator:
2130
- logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2131
- sample_validated &= self._sample_df_curator.validate()
2132
- if len(self._sample_df_curator.non_validated) > 0:
2133
- self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2134
- logger.print("")
2196
+ if len(table) == 0:
2197
+ continue
2135
2198
 
2136
- mods_validated = True
2137
- for table, adata_curator in self._table_adata_curators.items():
2138
- logger.info(f"validating categoricals of table '{table}' ...")
2139
- mods_validated &= adata_curator.validate()
2140
- if len(adata_curator.non_validated) > 0:
2141
- self._non_validated[table] = adata_curator.non_validated # type: ignore
2142
- logger.print("")
2199
+ df = table.to_pandas()
2200
+ # map values
2201
+ df[slot_key] = df[slot_key].map(
2202
+ lambda val: syn_mapper.get(val, val) # noqa
2203
+ )
2204
+ # write the mapped values
2205
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
2206
+ slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
2207
+ # update non_validated dict
2208
+ non_val_k = [
2209
+ nv for nv in self._non_validated_values[k] if nv not in syn_mapper
2210
+ ]
2211
+ self._non_validated_values[k] = non_val_k
2143
2212
 
2144
- self._is_validated = sample_validated & mods_validated
2145
- return self._is_validated
2213
+ syn_mapper_print = _format_values(
2214
+ [f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
2215
+ )
2216
+ s = "s" if n_syn_mapper > 1 else ""
2217
+ logger.success(
2218
+ f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
2219
+ )
2146
2220
 
2147
2221
  def save_artifact(
2148
2222
  self,
@@ -2152,217 +2226,119 @@ class SpatialDataCatManager(CatManager):
2152
2226
  revises: Artifact | None = None,
2153
2227
  run: Run | None = None,
2154
2228
  ) -> Artifact:
2229
+ """Save the validated `tiledbsoma` store and metadata.
2230
+
2231
+ Args:
2232
+ description: A description of the ``tiledbsoma`` store.
2233
+ key: A path-like key to reference artifact in default storage,
2234
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
2235
+ revises: Previous version of the artifact. Triggers a revision.
2236
+ run: The run that creates the artifact.
2237
+
2238
+ Returns:
2239
+ A saved artifact record.
2240
+ """
2155
2241
  if not self._is_validated:
2156
2242
  self.validate()
2157
2243
  if not self._is_validated:
2158
2244
  raise ValidationError("Dataset does not validate. Please curate.")
2159
2245
 
2160
- verbosity = settings.verbosity
2161
- try:
2162
- settings.verbosity = "warning"
2163
-
2164
- self._artifact = Artifact.from_spatialdata(
2165
- self._sdata,
2166
- key=key,
2246
+ if self._artifact is None:
2247
+ artifact = Artifact(
2248
+ self._dataset,
2167
2249
  description=description,
2250
+ key=key,
2168
2251
  revises=revises,
2169
2252
  run=run,
2170
2253
  )
2171
- self._artifact.save()
2254
+ artifact.n_observations = self._n_obs
2255
+ artifact.otype = "tiledbsoma"
2256
+ artifact.save()
2257
+ else:
2258
+ artifact = self._artifact
2172
2259
 
2173
- # Link schemas
2174
- feature_kwargs = check_registry_organism(
2175
- (list(self._var_fields.values())[0].field.model),
2176
- self._organism,
2260
+ feature_sets = {}
2261
+ if len(self._obs_fields) > 0:
2262
+ organism = configure_organism(
2263
+ self._columns_field.field.model, self._organism
2264
+ ).get("organism")
2265
+ empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
2266
+ mock_df = pa.Table.from_pydict(
2267
+ empty_dict, schema=self._obs_pa_schema
2268
+ ).to_pandas()
2269
+ # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
2270
+ feature_sets["obs"] = Schema.from_df(
2271
+ df=mock_df,
2272
+ field=self._columns_field,
2273
+ mute=True,
2274
+ organism=organism,
2177
2275
  )
2178
-
2179
- def _add_set_from_spatialdata(
2180
- host: Artifact | Collection | Run,
2181
- var_fields: dict[str, FieldAttr],
2182
- obs_fields: dict[str, FieldAttr] = None,
2183
- mute: bool = False,
2184
- organism: str | Record | None = None,
2185
- ):
2186
- """Add Schemas from SpatialData."""
2187
- if obs_fields is None:
2188
- obs_fields = {}
2189
- assert host.otype == "SpatialData" # noqa: S101
2190
-
2191
- feature_sets = {}
2192
-
2193
- # sample features
2194
- sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2195
- if len(sample_features) > 0:
2196
- feature_sets[self._sample_metadata_key] = Schema(
2197
- features=sample_features
2198
- )
2199
-
2200
- # table features
2201
- for table, field in var_fields.items():
2202
- table_fs = parse_staged_feature_sets_from_anndata(
2203
- self._sdata[table],
2204
- var_field=field,
2205
- obs_field=obs_fields.get(table, Feature.name),
2206
- mute=mute,
2207
- organism=organism,
2208
- )
2209
- for k, v in table_fs.items():
2210
- feature_sets[f"['{table}'].{k}"] = v
2211
-
2212
- def _unify_staged_feature_sets_by_hash(
2213
- feature_sets: MutableMapping[str, Schema],
2214
- ):
2215
- unique_values: dict[str, Any] = {}
2216
-
2217
- for key, value in feature_sets.items():
2218
- value_hash = (
2219
- value.hash
2220
- ) # Assuming each value has a .hash attribute
2221
- if value_hash in unique_values:
2222
- feature_sets[key] = unique_values[value_hash]
2223
- else:
2224
- unique_values[value_hash] = value
2225
-
2226
- return feature_sets
2227
-
2228
- # link feature sets
2229
- host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2230
- feature_sets
2231
- )
2232
- host.save()
2233
-
2234
- _add_set_from_spatialdata(
2235
- self._artifact, var_fields=self._var_fields, **feature_kwargs
2276
+ for ms in self._var_fields:
2277
+ var_key, var_field = self._var_fields[ms]
2278
+ organism = configure_organism(var_field.field.model, self._organism).get(
2279
+ "organism"
2236
2280
  )
2281
+ feature_sets[f"{ms}__var"] = Schema.from_values(
2282
+ values=self._validated_values[f"{ms}__{var_key}"],
2283
+ field=var_field,
2284
+ organism=organism,
2285
+ raise_validation_error=False,
2286
+ )
2287
+ artifact._staged_feature_sets = feature_sets
2237
2288
 
2238
- # Link labels
2239
- def _add_labels_from_spatialdata(
2240
- data,
2241
- artifact: Artifact,
2242
- fields: dict[str, FieldAttr],
2243
- feature_ref_is_name: bool | None = None,
2244
- ):
2245
- """Add Labels from SpatialData."""
2246
- features = Feature.lookup().dict()
2247
- for key, field in fields.items():
2248
- feature = features.get(key)
2249
- registry = field.field.model
2250
- filter_kwargs = check_registry_organism(registry, self._organism)
2251
- filter_kwargs_current = get_current_filter_kwargs(
2252
- registry, filter_kwargs
2253
- )
2254
- df = data if isinstance(data, pd.DataFrame) else data.obs
2255
- labels = registry.from_values(
2256
- df[key],
2257
- field=field,
2258
- **filter_kwargs_current,
2259
- )
2260
- if len(labels) == 0:
2261
- continue
2262
-
2263
- label_ref_is_name = None
2264
- if hasattr(registry, "_name_field"):
2265
- label_ref_is_name = field.field.name == registry._name_field
2266
- add_labels(
2267
- artifact,
2268
- records=labels,
2269
- feature=feature,
2270
- feature_ref_is_name=feature_ref_is_name,
2271
- label_ref_is_name=label_ref_is_name,
2272
- from_curator=True,
2273
- )
2274
-
2275
- for accessor, accessor_fields in self._categoricals.items():
2276
- column_field = self._var_fields.get(accessor)
2277
- if accessor == self._sample_metadata_key:
2278
- _add_labels_from_spatialdata(
2279
- self._sample_metadata,
2280
- self._artifact,
2281
- accessor_fields,
2282
- feature_ref_is_name=(
2283
- None if column_field is None else _ref_is_name(column_field)
2284
- ),
2285
- )
2286
- else:
2287
- _add_labels_from_spatialdata(
2288
- self._sdata.tables[accessor],
2289
- self._artifact,
2290
- accessor_fields,
2291
- feature_ref_is_name=(
2292
- None if column_field is None else _ref_is_name(column_field)
2293
- ),
2294
- )
2295
-
2296
- finally:
2297
- settings.verbosity = verbosity
2298
-
2299
- slug = ln_setup.settings.instance.slug
2300
- if ln_setup.settings.instance.is_remote: # pragma: no cover
2301
- logger.important(
2302
- f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2289
+ feature_ref_is_name = _ref_is_name(self._columns_field)
2290
+ features = Feature.lookup().dict()
2291
+ for key, field in self._obs_fields.items():
2292
+ feature = features.get(key)
2293
+ registry = field.field.model
2294
+ organism = configure_organism(field.field.model, self._organism).get(
2295
+ "organism"
2296
+ )
2297
+ labels = registry.from_values(
2298
+ values=self._validated_values[key], field=field, organism=organism
2303
2299
  )
2300
+ if len(labels) == 0:
2301
+ continue
2302
+ if hasattr(registry, "_name_field"):
2303
+ label_ref_is_name = field.field.name == registry._name_field
2304
+ add_labels(
2305
+ artifact,
2306
+ records=labels,
2307
+ feature=feature,
2308
+ feature_ref_is_name=feature_ref_is_name,
2309
+ label_ref_is_name=label_ref_is_name,
2310
+ from_curator=True,
2311
+ )
2304
2312
 
2305
- return self._artifact
2313
+ return artifact.save()
2306
2314
 
2307
2315
 
2308
- def _restrict_obs_fields(
2309
- obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2310
- ) -> dict[str, str]:
2311
- """Restrict the obs fields to name return only available obs fields.
2316
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2317
+ """Categorical manager for `AnnData` respecting the CELLxGENE schema.
2312
2318
 
2313
- To simplify the curation, we only validate against either name or ontology_id.
2314
- If both are available, we validate against ontology_id.
2315
- If none are available, we validate against name.
2319
+ This will be superceded by a schema-based curation flow.
2316
2320
  """
2317
- obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2318
- for name, field in obs_fields.items():
2319
- if name.endswith("_ontology_term_id"):
2320
- continue
2321
- # if both the ontology id and the name are present, only validate on the ontology_id
2322
- if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2323
- obs_fields_unique.pop(name)
2324
- # if the neither name nor ontology id are present, validate on the name
2325
- # this will raise error downstream, we just use name to be more readable
2326
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2327
- obs_fields_unique[name] = field
2328
-
2329
- # Only retain obs_fields_unique that have keys in adata.obs.columns
2330
- available_obs_fields = {
2331
- k: v for k, v in obs_fields_unique.items() if k in obs.columns
2332
- }
2333
-
2334
- return available_obs_fields
2335
-
2336
2321
 
2337
- def _add_defaults_to_obs(
2338
- obs: pd.DataFrame,
2339
- defaults: dict[str, str],
2340
- ) -> None:
2341
- """Add default columns and values to obs DataFrame."""
2342
- added_defaults: dict = {}
2343
- for name, default in defaults.items():
2344
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2345
- obs[name] = default
2346
- added_defaults[name] = default
2347
- logger.important(
2348
- f"added default value '{default}' to the adata.obs['{name}']"
2349
- )
2350
-
2351
-
2352
- class CellxGeneAnnDataCatManager(AnnDataCatManager):
2353
- """Annotation flow of AnnData based on CELLxGENE schema."""
2354
-
2355
- _controls_were_created: bool | None = None
2322
+ cxg_categoricals_defaults = {
2323
+ "cell_type": "unknown",
2324
+ "development_stage": "unknown",
2325
+ "disease": "normal",
2326
+ "donor_id": "unknown",
2327
+ "self_reported_ethnicity": "unknown",
2328
+ "sex": "unknown",
2329
+ "suspension_type": "cell",
2330
+ "tissue_type": "tissue",
2331
+ }
2356
2332
 
2357
2333
  def __init__(
2358
2334
  self,
2359
- adata: ad.AnnData | UPathStr,
2335
+ adata: ad.AnnData,
2360
2336
  categoricals: dict[str, FieldAttr] | None = None,
2361
2337
  organism: Literal["human", "mouse"] = "human",
2362
2338
  *,
2339
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2363
2340
  defaults: dict[str, str] = None,
2364
2341
  extra_sources: dict[str, Record] = None,
2365
- schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2366
2342
  verbosity: str = "hint",
2367
2343
  ) -> None:
2368
2344
  """CELLxGENE schema curator.
@@ -2372,304 +2348,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2372
2348
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2373
2349
  The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2374
2350
  organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2351
+ schema_version: The CELLxGENE schema version to curate against.
2375
2352
  defaults: Default values that are set if columns or column values are missing.
2376
2353
  extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2377
2354
  These extra sources are joined with the CELLxGENE fixed sources.
2378
2355
  Use this parameter when subclassing.
2379
- exclude: A dictionary mapping column names to values to exclude.
2380
- schema_version: The CELLxGENE schema version to curate against.
2381
2356
  verbosity: The verbosity level.
2382
-
2383
2357
  """
2384
2358
  import bionty as bt
2385
2359
 
2386
- CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2360
+ from ._cellxgene_schemas import (
2361
+ _add_defaults_to_obs,
2362
+ _create_sources,
2363
+ _init_categoricals_additional_values,
2364
+ _restrict_obs_fields,
2365
+ )
2387
2366
 
2388
- var_index: FieldAttr = bt.Gene.ensembl_gene_id
2367
+ # Add defaults first to ensure that we fetch valid sources
2368
+ if defaults:
2369
+ _add_defaults_to_obs(adata.obs, defaults)
2389
2370
 
2371
+ # Filter categoricals based on what's present in adata
2390
2372
  if categoricals is None:
2391
- categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2373
+ categoricals = self._get_cxg_categoricals()
2374
+ categoricals = _restrict_obs_fields(adata.obs, categoricals)
2392
2375
 
2393
- self.organism = organism
2394
-
2395
- VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2396
- if schema_version not in VALID_SCHEMA_VERSIONS:
2397
- valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2398
- raise ValueError(
2399
- f"Invalid schema_version: {schema_version}. "
2400
- f"Valid versions are: {valid_versions}"
2401
- )
2376
+ # Configure sources
2377
+ sources = _create_sources(categoricals, schema_version, organism)
2402
2378
  self.schema_version = schema_version
2403
2379
  self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2404
- with resources.path(
2405
- "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2406
- ) as schema_versions_path:
2407
- self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2408
- self.schema_version
2409
- ]
2410
-
2411
- # Fetch AnnData obs to be able to set defaults and get sources
2412
- if isinstance(adata, ad.AnnData):
2413
- self._adata_obs = adata.obs
2414
- else:
2415
- self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2416
-
2417
- # Add defaults first to ensure that we fetch valid sources
2418
- if defaults:
2419
- _add_defaults_to_obs(self._adata_obs, defaults)
2420
-
2421
- self.sources = self._create_sources(self._adata_obs)
2422
- self.sources = {
2423
- entity: source
2424
- for entity, source in self.sources.items()
2425
- if source is not None
2426
- }
2427
-
2428
2380
  # These sources are not a part of the cellxgene schema but rather passed through.
2429
2381
  # This is useful when other Curators extend the CELLxGENE curator
2430
2382
  if extra_sources:
2431
- self.sources = self.sources | extra_sources
2383
+ sources = sources | extra_sources
2432
2384
 
2433
- # Exclude default values from validation because they are not available in the pinned sources
2434
- exclude_keys = {
2435
- entity: default
2436
- for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2437
- if entity in self._adata_obs.columns # type: ignore
2438
- }
2385
+ _init_categoricals_additional_values()
2439
2386
 
2440
2387
  super().__init__(
2441
2388
  data=adata,
2442
- var_index=var_index,
2443
- categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2389
+ var_index=bt.Gene.ensembl_gene_id,
2390
+ categoricals=categoricals,
2444
2391
  verbosity=verbosity,
2445
2392
  organism=organism,
2446
- sources=self.sources,
2447
- exclude=exclude_keys,
2393
+ sources=sources,
2448
2394
  )
2449
2395
 
2450
2396
  @classmethod
2451
- def _init_categoricals_additional_values(cls) -> None:
2452
- import bionty as bt
2453
-
2454
- import lamindb as ln
2455
-
2456
- # Note: if you add another control below, be mindful to change the if condition that
2457
- # triggers whether creating these records is re-considered
2458
- if cls._controls_were_created is None:
2459
- cls._controls_were_created = (
2460
- ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2461
- is not None
2462
- )
2463
- if not cls._controls_were_created:
2464
- logger.important("Creating control labels in the CellxGene schema.")
2465
- bt.CellType(
2466
- ontology_id="unknown",
2467
- name="unknown",
2468
- description="From CellxGene schema.",
2469
- ).save()
2470
- pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2471
- normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2472
- bt.Disease(
2473
- uid=normal.uid,
2474
- name=normal.name,
2475
- ontology_id=normal.ontology_id,
2476
- description=normal.description,
2477
- source=normal.source,
2478
- ).save()
2479
- bt.Ethnicity(
2480
- ontology_id="na", name="na", description="From CellxGene schema."
2481
- ).save()
2482
- bt.Ethnicity(
2483
- ontology_id="unknown",
2484
- name="unknown",
2485
- description="From CellxGene schema.",
2486
- ).save()
2487
- bt.DevelopmentalStage(
2488
- ontology_id="unknown",
2489
- name="unknown",
2490
- description="From CellxGene schema.",
2491
- ).save()
2492
- bt.Phenotype(
2493
- ontology_id="unknown",
2494
- name="unknown",
2495
- description="From CellxGene schema.",
2496
- ).save()
2497
-
2498
- tissue_type = ln.ULabel(
2499
- name="TissueType",
2500
- is_type=True,
2501
- description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2502
- ).save()
2503
- ln.ULabel(
2504
- name="tissue", type=tissue_type, description="From CellxGene schema."
2505
- ).save()
2506
- ln.ULabel(
2507
- name="organoid", type=tissue_type, description="From CellxGene schema."
2508
- ).save()
2509
- ln.ULabel(
2510
- name="cell culture",
2511
- type=tissue_type,
2512
- description="From CellxGene schema.",
2513
- ).save()
2514
-
2515
- suspension_type = ln.ULabel(
2516
- name="SuspensionType",
2517
- is_type=True,
2518
- description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2519
- ).save()
2520
- ln.ULabel(
2521
- name="cell", type=suspension_type, description="From CellxGene schema."
2522
- ).save()
2523
- ln.ULabel(
2524
- name="nucleus",
2525
- type=suspension_type,
2526
- description="From CellxGene schema.",
2527
- ).save()
2528
- ln.ULabel(name="na", type=suspension_type).save()
2529
-
2530
- @classmethod
2531
- def _get_categoricals(cls) -> dict[str, FieldAttr]:
2532
- import bionty as bt
2533
-
2534
- return {
2535
- "assay": bt.ExperimentalFactor.name,
2536
- "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2537
- "cell_type": bt.CellType.name,
2538
- "cell_type_ontology_term_id": bt.CellType.ontology_id,
2539
- "development_stage": bt.DevelopmentalStage.name,
2540
- "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2541
- "disease": bt.Disease.name,
2542
- "disease_ontology_term_id": bt.Disease.ontology_id,
2543
- # "donor_id": "str", via pandera
2544
- "self_reported_ethnicity": bt.Ethnicity.name,
2545
- "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2546
- "sex": bt.Phenotype.name,
2547
- "sex_ontology_term_id": bt.Phenotype.ontology_id,
2548
- "suspension_type": ULabel.name,
2549
- "tissue": bt.Tissue.name,
2550
- "tissue_ontology_term_id": bt.Tissue.ontology_id,
2551
- "tissue_type": ULabel.name,
2552
- "organism": bt.Organism.name,
2553
- "organism_ontology_term_id": bt.Organism.ontology_id,
2554
- }
2555
-
2556
- @classmethod
2397
+ @deprecated(new_name="cxg_categoricals_defaults")
2557
2398
  def _get_categoricals_defaults(cls) -> dict[str, str]:
2558
- return {
2559
- "cell_type": "unknown",
2560
- "development_stage": "unknown",
2561
- "disease": "normal",
2562
- "donor_id": "unknown",
2563
- "self_reported_ethnicity": "unknown",
2564
- "sex": "unknown",
2565
- "suspension_type": "cell",
2566
- "tissue_type": "tissue",
2567
- }
2568
-
2569
- @property
2570
- def pinned_ontologies(self) -> pd.DataFrame:
2571
- return self._pinned_ontologies
2572
-
2573
- @property
2574
- def adata(self) -> AnnData:
2575
- return self._adata
2576
-
2577
- def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2578
- """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2579
- import bionty as bt
2580
-
2581
- # fmt: off
2582
- def _fetch_bionty_source(
2583
- entity: str, organism: str, source: str
2584
- ) -> bt.Source | None:
2585
- """Fetch the Bionty source of the pinned ontology.
2399
+ return cls.cxg_categoricals_defaults
2586
2400
 
2587
- Returns None if the source does not exist.
2588
- """
2589
- version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2590
- (self._pinned_ontologies["organism"] == organism) &
2591
- (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2592
- return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2593
-
2594
- entity_mapping = {
2595
- "var_index": ("Gene", self.organism, "ensembl"),
2596
- "cell_type": ("CellType", "all", "cl"),
2597
- "assay": ("ExperimentalFactor", "all", "efo"),
2598
- "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2599
- "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2600
- "disease": ("Disease", "all", "mondo"),
2601
- # "organism": ("Organism", "vertebrates", "ensembl"),
2602
- "sex": ("Phenotype", "all", "pato"),
2603
- "tissue": ("Tissue", "all", "uberon"),
2604
- }
2605
- # fmt: on
2606
-
2607
- # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2608
- entity_to_sources = {
2609
- entity: _fetch_bionty_source(*params)
2610
- for entity, params in entity_mapping.items()
2611
- if entity in obs.columns
2612
- or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2613
- or entity == "var_index"
2614
- }
2615
-
2616
- return entity_to_sources
2617
-
2618
- def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2619
- """Converts a column that stores a name into a column that stores the ontology id.
2620
-
2621
- cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2622
- """
2623
- field_name = field.field.name
2624
- assert field_name == "name" # noqa: S101
2625
- cols = ["name", "ontology_id"]
2626
- registry = field.field.model
2401
+ @classmethod
2402
+ def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
2403
+ """Returns the CELLxGENE schema mapped fields."""
2404
+ from ._cellxgene_schemas import _get_cxg_categoricals
2627
2405
 
2628
- if hasattr(registry, "ontology_id"):
2629
- validated_records = registry.filter(**{f"{field_name}__in": values})
2630
- mapper = (
2631
- pd.DataFrame(validated_records.values_list(*cols))
2632
- .set_index(0)
2633
- .to_dict()[1]
2634
- )
2635
- return values.map(mapper)
2406
+ return _get_cxg_categoricals()
2636
2407
 
2637
- def validate(self) -> bool: # type: ignore
2408
+ def validate(self) -> bool:
2638
2409
  """Validates the AnnData object against most cellxgene requirements."""
2410
+ from ._cellxgene_schemas import RESERVED_NAMES
2411
+
2639
2412
  # Verify that all required obs columns are present
2413
+ required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
2640
2414
  missing_obs_fields = [
2641
2415
  name
2642
- for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2416
+ for name in required_columns
2643
2417
  if name not in self._adata.obs.columns
2644
2418
  and f"{name}_ontology_term_id" not in self._adata.obs.columns
2645
2419
  ]
2646
2420
  if len(missing_obs_fields) > 0:
2647
- missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2648
- logger.error(f"missing required obs columns {missing_obs_fields_str}")
2649
- logger.info(
2650
- "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2651
- "to automatically add these columns with default values."
2421
+ logger.error(
2422
+ f"missing required obs columns {_format_values(missing_obs_fields)}\n"
2423
+ " → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
2652
2424
  )
2653
2425
  return False
2654
2426
 
2655
2427
  # Verify that no cellxgene reserved names are present
2656
- reserved_names = {
2657
- "ethnicity",
2658
- "ethnicity_ontology_term_id",
2659
- "X_normalization",
2660
- "default_field",
2661
- "layer_descriptions",
2662
- "tags",
2663
- "versions",
2664
- "contributors",
2665
- "preprint_doi",
2666
- "project_description",
2667
- "project_links",
2668
- "project_name",
2669
- "publication_doi",
2670
- }
2671
2428
  matched_columns = [
2672
- column for column in self._adata.obs.columns if column in reserved_names
2429
+ column for column in self._adata.obs.columns if column in RESERVED_NAMES
2673
2430
  ]
2674
2431
  if len(matched_columns) > 0:
2675
2432
  raise ValueError(
@@ -2696,6 +2453,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2696
2453
  Returns:
2697
2454
  An AnnData object which adheres to the cellxgene-schema.
2698
2455
  """
2456
+
2457
+ def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
2458
+ """Converts a column that stores a name into a column that stores the ontology id.
2459
+
2460
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2461
+ """
2462
+ field_name = field.field.name
2463
+ assert field_name == "name" # noqa: S101
2464
+ cols = ["name", "ontology_id"]
2465
+ registry = field.field.model
2466
+
2467
+ if hasattr(registry, "ontology_id"):
2468
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2469
+ mapper = (
2470
+ pd.DataFrame(validated_records.values_list(*cols))
2471
+ .set_index(0)
2472
+ .to_dict()[1]
2473
+ )
2474
+ return values.map(mapper)
2475
+
2699
2476
  # Create a copy since we modify the AnnData object extensively
2700
2477
  adata_cxg = self._adata.copy()
2701
2478
 
@@ -2715,7 +2492,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2715
2492
  # convert name column to ontology_term_id column
2716
2493
  for column in adata_cxg.obs.columns:
2717
2494
  if column in self.categoricals and not column.endswith("_ontology_term_id"):
2718
- mapped_column = self._convert_name_to_ontology_id(
2495
+ mapped_column = _convert_name_to_ontology_id(
2719
2496
  adata_cxg.obs[column], field=self.categoricals.get(column)
2720
2497
  )
2721
2498
  if mapped_column is not None:
@@ -2881,7 +2658,7 @@ class TimeHandler:
2881
2658
 
2882
2659
 
2883
2660
  class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2884
- """Curator flow for Perturbation data."""
2661
+ """Categorical manager for `AnnData` to manage perturbations."""
2885
2662
 
2886
2663
  PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2887
2664
 
@@ -2892,45 +2669,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2892
2669
  pert_dose: bool = True,
2893
2670
  pert_time: bool = True,
2894
2671
  *,
2672
+ cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2895
2673
  verbosity: str = "hint",
2896
- cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2897
2674
  ):
2898
2675
  """Initialize the curator with configuration and validation settings."""
2899
- import bionty as bt
2900
-
2901
2676
  self._pert_time = pert_time
2902
2677
  self._pert_dose = pert_dose
2903
2678
 
2904
2679
  self._validate_initial_data(adata)
2905
- self._setup_configuration(adata)
2906
-
2907
- self._setup_sources(adata)
2908
- self._setup_compound_source()
2680
+ categoricals, categoricals_defaults = self._configure_categoricals(adata)
2909
2681
 
2910
2682
  super().__init__(
2911
2683
  adata=adata,
2912
- categoricals=self.PT_CATEGORICALS,
2913
- defaults=self.PT_DEFAULT_VALUES,
2914
- verbosity=verbosity,
2684
+ categoricals=categoricals,
2685
+ defaults=categoricals_defaults,
2915
2686
  organism=organism,
2916
- extra_sources=self.PT_SOURCES,
2687
+ extra_sources=self._configure_sources(adata),
2917
2688
  schema_version=cxg_schema_version,
2689
+ verbosity=verbosity,
2918
2690
  )
2919
2691
 
2920
- def _setup_configuration(self, adata: ad.AnnData):
2692
+ def _configure_categoricals(self, adata: ad.AnnData):
2921
2693
  """Set up default configuration values."""
2922
2694
  import bionty as bt
2923
2695
  import wetlab as wl
2924
2696
 
2925
- self.PT_DEFAULT_VALUES = (
2926
- CellxGeneAnnDataCatManager._get_categoricals_defaults()
2927
- | {
2928
- "cell_line": "unknown",
2929
- "pert_target": "unknown",
2930
- }
2931
- )
2932
-
2933
- self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2697
+ categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
2934
2698
  k: v
2935
2699
  for k, v in {
2936
2700
  "cell_line": bt.CellLine.name,
@@ -2942,22 +2706,40 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2942
2706
  }.items()
2943
2707
  if k in adata.obs.columns
2944
2708
  }
2945
- # if "donor_id" in self.PT_CATEGORICALS:
2946
- # self.PT_CATEGORICALS["donor_id"] = Donor.name
2709
+ # if "donor_id" in categoricals:
2710
+ # categoricals["donor_id"] = Donor.name
2947
2711
 
2948
- def _setup_sources(self, adata: ad.AnnData):
2712
+ categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
2713
+ "cell_line": "unknown",
2714
+ "pert_target": "unknown",
2715
+ }
2716
+
2717
+ return categoricals, categoricals_defaults
2718
+
2719
+ def _configure_sources(self, adata: ad.AnnData):
2949
2720
  """Set up data sources."""
2950
- self.PT_SOURCES = {}
2951
- # if "cell_line" in adata.obs.columns:
2952
- # self.PT_SOURCES["cell_line"] = (
2953
- # bt.Source.filter(name="depmap").first()
2954
- # )
2721
+ import bionty as bt
2722
+ import wetlab as wl
2723
+
2724
+ sources = {}
2725
+ if "cell_line" in adata.obs.columns:
2726
+ sources["cell_line"] = bt.Source.filter(
2727
+ entity="bionty.CellLine", name="depmap"
2728
+ ).first()
2955
2729
  if "pert_compound" in adata.obs.columns:
2956
- import bionty as bt
2730
+ with logger.mute():
2731
+ chebi_source = bt.Source.filter(
2732
+ entity="wetlab.Compound", name="chebi"
2733
+ ).first()
2734
+ if not chebi_source:
2735
+ wl.Compound.add_source(
2736
+ bt.Source.filter(entity="Drug", name="chebi").first()
2737
+ )
2957
2738
 
2958
- self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2739
+ sources["pert_compound"] = bt.Source.filter(
2959
2740
  entity="wetlab.Compound", name="chebi"
2960
2741
  ).first()
2742
+ return sources
2961
2743
 
2962
2744
  def _validate_initial_data(self, adata: ad.AnnData):
2963
2745
  """Validate the initial data structure."""
@@ -3005,20 +2787,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3005
2787
  adata.obs[col_name].cat.remove_unused_categories()
3006
2788
  logger.important(f"mapped 'pert_name' to '{col_name}'")
3007
2789
 
3008
- def _setup_compound_source(self):
3009
- """Set up the compound source with muted logging."""
3010
- import bionty as bt
3011
- import wetlab as wl
3012
-
3013
- with logger.mute():
3014
- chebi_source = bt.Source.filter(
3015
- entity="wetlab.Compound", name="chebi"
3016
- ).first()
3017
- if not chebi_source:
3018
- wl.Compound.add_source(
3019
- bt.Source.filter(entity="Drug", name="chebi").first()
3020
- )
3021
-
3022
2790
  def validate(self) -> bool: # type: ignore
3023
2791
  """Validate the AnnData object."""
3024
2792
  validated = super().validate()
@@ -3136,8 +2904,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3136
2904
 
3137
2905
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
3138
2906
  """Make sure the source and organism are saved in the same database as the registry."""
3139
- from lamindb.core._settings import settings
3140
-
3141
2907
  db = registry.filter().db
3142
2908
  source = kwargs.get("source")
3143
2909
  organism = kwargs.get("organism")
@@ -3162,44 +2928,15 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
3162
2928
  return filter_kwargs
3163
2929
 
3164
2930
 
3165
- def inspect_instance(
3166
- values: Iterable[str],
3167
- field: FieldAttr,
3168
- registry: type[Record],
3169
- exclude: str | list | None = None,
3170
- **kwargs,
3171
- ):
3172
- """Inspect values using a registry."""
3173
- # inspect exclude values in the default instance
3174
- values = list(values)
3175
- include_validated = []
3176
- if exclude is not None:
3177
- exclude = [exclude] if isinstance(exclude, str) else exclude
3178
- exclude = [i for i in exclude if i in values]
3179
- if len(exclude) > 0:
3180
- # exclude values are validated without source and organism
3181
- inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
3182
- # if exclude values are validated, remove them from the values
3183
- values = [i for i in values if i not in inspect_result_exclude.validated]
3184
- include_validated = inspect_result_exclude.validated
3185
-
3186
- inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
3187
- inspect_result._validated += include_validated
3188
- inspect_result._non_validated = [
3189
- i for i in inspect_result.non_validated if i not in include_validated
3190
- ]
3191
-
3192
- return inspect_result
3193
-
3194
-
3195
- def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
2931
+ def configure_organism(registry: Record, organism: str | None = None) -> dict[str, str]:
3196
2932
  """Check if a registry needs an organism and return the organism name."""
3197
- if hasattr(registry, "organism_id"):
2933
+ from ..models._from_values import _is_organism_required
2934
+
2935
+ if _is_organism_required(registry):
3198
2936
  import bionty as bt
3199
2937
 
3200
- if organism is None and bt.settings.organism is None:
3201
- return {}
3202
- return {"organism": organism or bt.settings.organism.name}
2938
+ if organism is not None or bt.settings.organism is not None:
2939
+ return {"organism": organism or bt.settings.organism.name}
3203
2940
  return {}
3204
2941
 
3205
2942
 
@@ -3209,7 +2946,6 @@ def validate_categories(
3209
2946
  key: str,
3210
2947
  organism: str | None = None,
3211
2948
  source: Record | None = None,
3212
- exclude: str | list | None = None,
3213
2949
  hint_print: str | None = None,
3214
2950
  curator: CatManager | None = None,
3215
2951
  ) -> tuple[bool, list[str]]:
@@ -3221,13 +2957,9 @@ def validate_categories(
3221
2957
  key: The key referencing the slot in the DataFrame.
3222
2958
  organism: The organism name.
3223
2959
  source: The source record.
3224
- exclude: Exclude specific values from validation.
3225
2960
  standardize: Whether to standardize the values.
3226
2961
  hint_print: The hint to print that suggests fixing non-validated values.
3227
2962
  """
3228
- from lamindb.core._settings import settings
3229
- from lamindb.models._from_values import _format_values
3230
-
3231
2963
  model_field = f"{field.field.model.__name__}.{field.field.name}"
3232
2964
 
3233
2965
  def _log_mapping_info():
@@ -3237,36 +2969,26 @@ def validate_categories(
3237
2969
 
3238
2970
  registry = field.field.model
3239
2971
 
3240
- # {"organism": organism_name/organism_record}
3241
- kwargs = check_registry_organism(registry, organism)
2972
+ # {"organism": organism_name}
2973
+ kwargs = configure_organism(registry, organism)
3242
2974
  kwargs.update({"source": source} if source else {})
3243
2975
  kwargs_current = get_current_filter_kwargs(registry, kwargs)
3244
2976
 
3245
2977
  # inspect values from the default instance
3246
- inspect_result = inspect_instance(
3247
- values=values,
3248
- field=field,
3249
- registry=registry,
3250
- exclude=exclude,
3251
- **kwargs_current,
3252
- )
2978
+ inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
3253
2979
  non_validated = inspect_result.non_validated
3254
2980
  syn_mapper = inspect_result.synonyms_mapper
3255
2981
 
3256
2982
  # inspect the non-validated values from public (bionty only)
3257
2983
  values_validated = []
3258
2984
  if hasattr(registry, "public"):
3259
- verbosity = settings.verbosity
3260
- try:
3261
- settings.verbosity = "error"
3262
- public_records = registry.from_values(
3263
- non_validated,
3264
- field=field,
3265
- **kwargs_current,
3266
- )
3267
- values_validated += [getattr(r, field.field.name) for r in public_records]
3268
- finally:
3269
- settings.verbosity = verbosity
2985
+ public_records = registry.from_values(
2986
+ non_validated,
2987
+ field=field,
2988
+ mute=True,
2989
+ **kwargs_current,
2990
+ )
2991
+ values_validated += [getattr(r, field.field.name) for r in public_records]
3270
2992
 
3271
2993
  # logging messages
3272
2994
  non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
@@ -3330,7 +3052,6 @@ def validate_categories_in_df(
3330
3052
  df: pd.DataFrame,
3331
3053
  fields: dict[str, FieldAttr],
3332
3054
  sources: dict[str, Record] = None,
3333
- exclude: dict | None = None,
3334
3055
  curator: CatManager | None = None,
3335
3056
  **kwargs,
3336
3057
  ) -> tuple[bool, dict]:
@@ -3348,7 +3069,6 @@ def validate_categories_in_df(
3348
3069
  field=field,
3349
3070
  key=key,
3350
3071
  source=sources.get(key),
3351
- exclude=exclude.get(key) if exclude else None,
3352
3072
  curator=curator,
3353
3073
  **kwargs,
3354
3074
  )
@@ -3359,9 +3079,10 @@ def validate_categories_in_df(
3359
3079
 
3360
3080
 
3361
3081
  def save_artifact(
3362
- data: pd.DataFrame | ad.AnnData | MuData,
3082
+ data: pd.DataFrame | ScverseDataStructures,
3083
+ *,
3363
3084
  fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
3364
- columns_field: FieldAttr | dict[str, FieldAttr] | None = None,
3085
+ index_field: FieldAttr | dict[str, FieldAttr] | None = None,
3365
3086
  description: str | None = None,
3366
3087
  organism: str | None = None,
3367
3088
  key: str | None = None,
@@ -3369,73 +3090,64 @@ def save_artifact(
3369
3090
  revises: Artifact | None = None,
3370
3091
  run: Run | None = None,
3371
3092
  schema: Schema | None = None,
3093
+ **kwargs,
3372
3094
  ) -> Artifact:
3373
3095
  """Save all metadata with an Artifact.
3374
3096
 
3375
3097
  Args:
3376
- data: The DataFrame/AnnData/MuData object to save.
3098
+ data: The object to save.
3377
3099
  fields: A dictionary mapping obs_column to registry_field.
3378
- columns_field: The registry field to validate variables index against.
3100
+ index_field: The registry field to validate variables index against.
3379
3101
  description: A description of the artifact.
3380
3102
  organism: The organism name.
3381
- type: The artifact type.
3382
3103
  key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3383
3104
  artifact: An already registered artifact. Passing this will not save a new artifact from data.
3384
3105
  revises: Previous version of the artifact. Triggers a revision.
3385
3106
  run: The run that creates the artifact.
3107
+ schema: The Schema to associate with the Artifact.
3386
3108
 
3387
3109
  Returns:
3388
3110
  The saved Artifact.
3389
3111
  """
3390
- from ..models.artifact import add_labels, data_is_anndata, data_is_mudata
3112
+ from ..models.artifact import add_labels
3391
3113
 
3392
3114
  if artifact is None:
3393
- if data_is_anndata(data):
3394
- artifact = Artifact.from_anndata(
3115
+ if isinstance(data, pd.DataFrame):
3116
+ artifact = Artifact.from_df(
3395
3117
  data, description=description, key=key, revises=revises, run=run
3396
3118
  )
3397
- elif isinstance(data, pd.DataFrame):
3398
- artifact = Artifact.from_df(
3119
+ elif isinstance(data, AnnData):
3120
+ artifact = Artifact.from_anndata(
3399
3121
  data, description=description, key=key, revises=revises, run=run
3400
3122
  )
3401
3123
  elif data_is_mudata(data):
3402
3124
  artifact = Artifact.from_mudata(
3403
- data,
3404
- description=description,
3405
- key=key,
3406
- revises=revises,
3407
- run=run,
3125
+ data, description=description, key=key, revises=revises, run=run
3126
+ )
3127
+ elif data_is_spatialdata(data):
3128
+ artifact = Artifact.from_spatialdata(
3129
+ data, description=description, key=key, revises=revises, run=run
3130
+ )
3131
+ else:
3132
+ raise InvalidArgument( # pragma: no cover
3133
+ "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
3408
3134
  )
3409
- artifact.schema = schema
3410
3135
  artifact.save()
3411
3136
 
3412
- if organism is not None and columns_field is not None:
3413
- feature_kwargs = check_registry_organism(
3137
+ if organism is not None and index_field is not None:
3138
+ feature_kwargs = configure_organism(
3414
3139
  (
3415
- list(columns_field.values())[0].field.model
3416
- if isinstance(columns_field, dict)
3417
- else columns_field.field.model
3140
+ list(index_field.values())[0].field.model
3141
+ if isinstance(index_field, dict)
3142
+ else index_field.field.model
3418
3143
  ),
3419
3144
  organism,
3420
3145
  )
3421
3146
  else:
3422
3147
  feature_kwargs = {}
3423
3148
 
3424
- if artifact.otype == "DataFrame":
3425
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
3426
- elif artifact.otype == "AnnData":
3427
- artifact.features._add_set_from_anndata( # type: ignore
3428
- var_field=columns_field, **feature_kwargs
3429
- )
3430
- elif artifact.otype == "MuData":
3431
- artifact.features._add_set_from_mudata( # type: ignore
3432
- var_fields=columns_field, **feature_kwargs
3433
- )
3434
- else:
3435
- raise NotImplementedError
3436
-
3437
3149
  def _add_labels(
3438
- data,
3150
+ data: pd.DataFrame | ScverseDataStructures,
3439
3151
  artifact: Artifact,
3440
3152
  fields: dict[str, FieldAttr],
3441
3153
  feature_ref_is_name: bool | None = None,
@@ -3444,7 +3156,7 @@ def save_artifact(
3444
3156
  for key, field in fields.items():
3445
3157
  feature = features.get(key)
3446
3158
  registry = field.field.model
3447
- filter_kwargs = check_registry_organism(registry, organism)
3159
+ filter_kwargs = configure_organism(registry, organism)
3448
3160
  filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3449
3161
  df = data if isinstance(data, pd.DataFrame) else data.obs
3450
3162
  # multi-value columns are separated by "|"
@@ -3471,35 +3183,81 @@ def save_artifact(
3471
3183
  from_curator=True,
3472
3184
  )
3473
3185
 
3474
- if artifact.otype == "MuData":
3475
- for modality, modality_fields in fields.items():
3476
- column_field_modality = columns_field.get(modality)
3477
- if modality == "obs":
3478
- _add_labels(
3479
- data,
3480
- artifact,
3481
- modality_fields,
3482
- feature_ref_is_name=(
3483
- None
3484
- if column_field_modality is None
3485
- else _ref_is_name(column_field_modality)
3486
- ),
3487
- )
3488
- else:
3489
- _add_labels(
3490
- data[modality],
3491
- artifact,
3492
- modality_fields,
3493
- feature_ref_is_name=(
3494
- None
3495
- if column_field_modality is None
3496
- else _ref_is_name(column_field_modality)
3497
- ),
3498
- )
3499
- else:
3500
- _add_labels(
3501
- data, artifact, fields, feature_ref_is_name=_ref_is_name(columns_field)
3502
- )
3186
+ match artifact.otype:
3187
+ case "DataFrame":
3188
+ artifact.features._add_set_from_df(field=index_field, **feature_kwargs) # type: ignore
3189
+ _add_labels(
3190
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3191
+ )
3192
+ case "AnnData":
3193
+ artifact.features._add_set_from_anndata( # type: ignore
3194
+ var_field=index_field, **feature_kwargs
3195
+ )
3196
+ _add_labels(
3197
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3198
+ )
3199
+ case "MuData":
3200
+ artifact.features._add_set_from_mudata( # type: ignore
3201
+ var_fields=index_field, **feature_kwargs
3202
+ )
3203
+ for modality, modality_fields in fields.items():
3204
+ column_field_modality = index_field.get(modality)
3205
+ if modality == "obs":
3206
+ _add_labels(
3207
+ data,
3208
+ artifact,
3209
+ modality_fields,
3210
+ feature_ref_is_name=(
3211
+ None
3212
+ if column_field_modality is None
3213
+ else _ref_is_name(column_field_modality)
3214
+ ),
3215
+ )
3216
+ else:
3217
+ _add_labels(
3218
+ data[modality],
3219
+ artifact,
3220
+ modality_fields,
3221
+ feature_ref_is_name=(
3222
+ None
3223
+ if column_field_modality is None
3224
+ else _ref_is_name(column_field_modality)
3225
+ ),
3226
+ )
3227
+ case "SpatialData":
3228
+ artifact.features._add_set_from_spatialdata( # type: ignore
3229
+ sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
3230
+ var_fields=index_field,
3231
+ **feature_kwargs,
3232
+ )
3233
+ sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
3234
+ for accessor, accessor_fields in fields.items():
3235
+ column_field = index_field.get(accessor)
3236
+ if accessor == sample_metadata_key:
3237
+ _add_labels(
3238
+ data.get_attrs(
3239
+ key=sample_metadata_key, return_as="df", flatten=True
3240
+ ),
3241
+ artifact,
3242
+ accessor_fields,
3243
+ feature_ref_is_name=(
3244
+ None if column_field is None else _ref_is_name(column_field)
3245
+ ),
3246
+ )
3247
+ else:
3248
+ _add_labels(
3249
+ data.tables[accessor],
3250
+ artifact,
3251
+ accessor_fields,
3252
+ feature_ref_is_name=(
3253
+ None if column_field is None else _ref_is_name(column_field)
3254
+ ),
3255
+ )
3256
+ case _:
3257
+ raise NotImplementedError # pragma: no cover
3258
+
3259
+ artifact.schema = schema
3260
+ artifact.save()
3503
3261
 
3504
3262
  slug = ln_setup.settings.instance.slug
3505
3263
  if ln_setup.settings.instance.is_remote: # pragma: no cover
@@ -3529,8 +3287,7 @@ def update_registry(
3529
3287
  organism: str | None = None,
3530
3288
  dtype: str | None = None,
3531
3289
  source: Record | None = None,
3532
- exclude: str | list | None = None,
3533
- **kwargs,
3290
+ **create_kwargs,
3534
3291
  ) -> None:
3535
3292
  """Save features or labels records in the default instance.
3536
3293
 
@@ -3543,14 +3300,12 @@ def update_registry(
3543
3300
  organism: The organism name.
3544
3301
  dtype: The type of the feature.
3545
3302
  source: The source record.
3546
- exclude: Values to exclude from inspect.
3547
- kwargs: Additional keyword arguments to pass to the registry model to create new records.
3303
+ **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
3548
3304
  """
3549
- from lamindb.core._settings import settings
3550
3305
  from lamindb.models.save import save as ln_save
3551
3306
 
3552
3307
  registry = field.field.model
3553
- filter_kwargs = check_registry_organism(registry, organism)
3308
+ filter_kwargs = configure_organism(registry, organism)
3554
3309
  filter_kwargs.update({"source": source} if source else {})
3555
3310
  values = [i for i in values if isinstance(i, str) and i]
3556
3311
  if not values:
@@ -3608,14 +3363,16 @@ def update_registry(
3608
3363
  registry(
3609
3364
  **init_kwargs,
3610
3365
  **{k: v for k, v in filter_kwargs.items() if k != "source"},
3611
- **{k: v for k, v in kwargs.items() if k != "sources"},
3366
+ **{
3367
+ k: v for k, v in create_kwargs.items() if k != "sources"
3368
+ },
3612
3369
  )
3613
3370
  )
3614
3371
  ln_save(non_validated_records)
3615
3372
 
3616
3373
  # save parent labels for ulabels, for example a parent label "project" for label "project001"
3617
3374
  if registry == ULabel and field.field.name == "name":
3618
- save_ulabels_parent(values, field=field, key=key)
3375
+ save_ulabels_type(values, field=field, key=key)
3619
3376
 
3620
3377
  finally:
3621
3378
  settings.verbosity = verbosity
@@ -3653,16 +3410,18 @@ def log_saved_labels(
3653
3410
  )
3654
3411
 
3655
3412
 
3656
- def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
3657
- """Save a parent label for the given labels."""
3413
+ def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
3414
+ """Save the ULabel type of the given labels."""
3658
3415
  registry = field.field.model
3659
3416
  assert registry == ULabel # noqa: S101
3660
- all_records = registry.from_values(list(values), field=field)
3661
- is_feature = registry.filter(name=f"{key}").one_or_none()
3662
- if is_feature is None:
3663
- is_feature = registry(name=f"{key}").save()
3664
- logger.important(f"Created a parent ULabel: {is_feature}")
3665
- is_feature.children.add(*all_records)
3417
+ all_records = registry.filter(**{field.field.name: list(values)}).all()
3418
+ # so `tissue_type` becomes `TissueType`
3419
+ type_name = "".join([i.capitalize() for i in key.lower().split("_")])
3420
+ ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
3421
+ if ulabel_type is None:
3422
+ ulabel_type = registry(name=type_name, is_type=True).save()
3423
+ logger.important(f"Created a ULabel type: {ulabel_type}")
3424
+ all_records.update(type=ulabel_type)
3666
3425
 
3667
3426
 
3668
3427
  def _save_organism(name: str):
@@ -3761,7 +3520,6 @@ def from_tiledbsoma(
3761
3520
  obs_columns: FieldAttr = Feature.name,
3762
3521
  organism: str | None = None,
3763
3522
  sources: dict[str, Record] | None = None,
3764
- exclude: dict[str, str | list[str]] | None = None,
3765
3523
  ) -> TiledbsomaCatManager:
3766
3524
  return TiledbsomaCatManager(
3767
3525
  experiment_uri=experiment_uri,
@@ -3770,7 +3528,6 @@ def from_tiledbsoma(
3770
3528
  obs_columns=obs_columns,
3771
3529
  organism=organism,
3772
3530
  sources=sources,
3773
- exclude=exclude,
3774
3531
  )
3775
3532
 
3776
3533
 
@@ -3782,7 +3539,6 @@ def from_spatialdata(
3782
3539
  categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3783
3540
  organism: str | None = None,
3784
3541
  sources: dict[str, dict[str, Record]] | None = None,
3785
- exclude: dict[str, dict] | None = None,
3786
3542
  verbosity: str = "hint",
3787
3543
  *,
3788
3544
  sample_metadata_key: str = "sample",
@@ -3799,7 +3555,6 @@ def from_spatialdata(
3799
3555
  verbosity=verbosity,
3800
3556
  organism=organism,
3801
3557
  sources=sources,
3802
- exclude=exclude,
3803
3558
  sample_metadata_key=sample_metadata_key,
3804
3559
  )
3805
3560