lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,27 @@
1
1
  """Curators.
2
2
 
3
- .. versionadded:: 1.1.0
4
-
5
3
  .. autosummary::
6
4
  :toctree: .
7
5
 
8
- Curator
9
6
  DataFrameCurator
10
7
  AnnDataCurator
8
+ MuDataCurator
9
+ SpatialDataCurator
11
10
 
12
- CatManager:
11
+ Helper classes.
13
12
 
14
13
  .. autosummary::
15
14
  :toctree: .
16
15
 
16
+ Curator
17
+ SlotsCurator
17
18
  CatManager
19
+ CatLookup
18
20
  DataFrameCatManager
19
21
  AnnDataCatManager
20
22
  MuDataCatManager
23
+ SpatialDataCatManager
21
24
  TiledbsomaCatManager
22
- CurateLookup
23
25
 
24
26
  """
25
27
 
@@ -27,9 +29,8 @@ from __future__ import annotations
27
29
 
28
30
  import copy
29
31
  import re
30
- from importlib import resources
31
32
  from itertools import chain
32
- from typing import TYPE_CHECKING, Any, Literal
33
+ from typing import TYPE_CHECKING, Any, Callable, Literal
33
34
 
34
35
  import anndata as ad
35
36
  import lamindb_setup as ln_setup
@@ -37,45 +38,44 @@ import pandas as pd
37
38
  import pandera
38
39
  import pyarrow as pa
39
40
  from lamin_utils import colors, logger
40
- from lamindb_setup.core import deprecated, upath
41
+ from lamindb_setup.core import deprecated
41
42
  from lamindb_setup.core._docs import doc_args
42
43
  from lamindb_setup.core.upath import UPath
43
44
 
44
- from lamindb.core.storage._backed_access import backed_access
45
-
46
- from ._cellxgene_schemas import _read_schema_versions
47
-
48
45
  if TYPE_CHECKING:
49
- from anndata import AnnData
50
46
  from lamindb_setup.core.types import UPathStr
47
+ from mudata import MuData
48
+ from spatialdata import SpatialData
51
49
 
52
- from lamindb.base.types import FieldAttr
50
+ from lamindb.core.types import ScverseDataStructures
53
51
  from lamindb.models import Record
54
52
  from lamindb.base.types import FieldAttr # noqa
55
53
  from lamindb.core._settings import settings
56
54
  from lamindb.models import (
57
55
  Artifact,
58
- Collection,
59
56
  Feature,
60
57
  Record,
61
58
  Run,
62
59
  Schema,
63
60
  ULabel,
64
61
  )
65
- from lamindb.models._feature_manager import parse_staged_feature_sets_from_anndata
66
- from lamindb.models.artifact import add_labels, data_is_anndata
67
- from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
62
+ from lamindb.models.artifact import (
63
+ add_labels,
64
+ data_is_anndata,
65
+ data_is_mudata,
66
+ data_is_spatialdata,
67
+ )
68
+ from lamindb.models.feature import parse_dtype, parse_cat_dtype
68
69
  from lamindb.models._from_values import _format_values
69
70
 
70
71
  from ..errors import InvalidArgument, ValidationError
72
+ from anndata import AnnData
71
73
 
72
74
  if TYPE_CHECKING:
73
75
  from collections.abc import Iterable, MutableMapping
74
76
  from typing import Any
75
77
 
76
78
  from lamindb_setup.core.types import UPathStr
77
- from mudata import MuData
78
- from spatialdata import SpatialData
79
79
 
80
80
  from lamindb.models.query_set import RecordList
81
81
 
@@ -86,7 +86,7 @@ def strip_ansi_codes(text):
86
86
  return ansi_pattern.sub("", text)
87
87
 
88
88
 
89
- class CurateLookup:
89
+ class CatLookup:
90
90
  """Lookup categories from the reference instance.
91
91
 
92
92
  Args:
@@ -94,10 +94,10 @@ class CurateLookup:
94
94
  slots: A dictionary of slot fields to lookup.
95
95
  public: Whether to lookup from the public instance. Defaults to False.
96
96
 
97
- Example:
98
- >>> curator = ln.Curator.from_df(...)
99
- >>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
100
- <Category: alveolar_type_1_fibroblast_cell>
97
+ Example::
98
+
99
+ curator = ln.curators.DataFrameCurator(...)
100
+ curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
101
101
 
102
102
  """
103
103
 
@@ -106,16 +106,22 @@ class CurateLookup:
106
106
  categoricals: dict[str, FieldAttr],
107
107
  slots: dict[str, FieldAttr] = None,
108
108
  public: bool = False,
109
+ organism: str | None = None,
110
+ sources: dict[str, Record] | None = None,
109
111
  ) -> None:
110
112
  slots = slots or {}
111
113
  self._categoricals = {**categoricals, **slots}
112
114
  self._public = public
115
+ self._organism = organism
116
+ self._sources = sources
113
117
 
114
118
  def __getattr__(self, name):
115
119
  if name in self._categoricals:
116
120
  registry = self._categoricals[name].field.model
117
121
  if self._public and hasattr(registry, "public"):
118
- return registry.public().lookup()
122
+ return registry.public(
123
+ organism=self._organism, source=self._sources.get(name)
124
+ ).lookup()
119
125
  else:
120
126
  return registry.lookup()
121
127
  raise AttributeError(
@@ -126,7 +132,9 @@ class CurateLookup:
126
132
  if name in self._categoricals:
127
133
  registry = self._categoricals[name].field.model
128
134
  if self._public and hasattr(registry, "public"):
129
- return registry.public().lookup()
135
+ return registry.public(
136
+ organism=self._organism, source=self._sources.get(name)
137
+ ).lookup()
130
138
  else:
131
139
  return registry.lookup()
132
140
  raise AttributeError(
@@ -150,7 +158,7 @@ class CurateLookup:
150
158
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
151
159
  "To look up public ontologies, use .lookup(public=True)"
152
160
  )
153
- else: # pdagma: no cover
161
+ else: # pragma: no cover
154
162
  return colors.warning("No fields are found!")
155
163
 
156
164
 
@@ -163,7 +171,7 @@ SLOTS_DOCSTRING = """Curator objects by slot.
163
171
  """
164
172
 
165
173
 
166
- VALIDATE_DOCSTRING = """Validate dataset.
174
+ VALIDATE_DOCSTRING = """Validate dataset against Schema.
167
175
 
168
176
  Raises:
169
177
  lamindb.errors.ValidationError: If validation fails.
@@ -183,15 +191,17 @@ Returns:
183
191
 
184
192
 
185
193
  class Curator:
186
- """Dataset curator.
194
+ """Curator base class.
187
195
 
188
196
  A `Curator` object makes it easy to validate, standardize & annotate datasets.
189
197
 
190
- .. versionadded:: 1.1.0
191
-
192
198
  See:
193
199
  - :class:`~lamindb.curators.DataFrameCurator`
194
200
  - :class:`~lamindb.curators.AnnDataCurator`
201
+ - :class:`~lamindb.curators.MuDataCurator`
202
+ - :class:`~lamindb.curators.SpatialDataCurator`
203
+
204
+ .. versionadded:: 1.1.0
195
205
  """
196
206
 
197
207
  def __init__(self, dataset: Any, schema: Schema | None = None):
@@ -199,7 +209,12 @@ class Curator:
199
209
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
200
210
  if isinstance(self._dataset, Artifact):
201
211
  self._artifact = self._dataset
202
- if self._artifact.otype in {"DataFrame", "AnnData"}:
212
+ if self._artifact.otype in {
213
+ "DataFrame",
214
+ "AnnData",
215
+ "MuData",
216
+ "SpatialData",
217
+ }:
203
218
  self._dataset = self._dataset.load()
204
219
  self._schema: Schema | None = schema
205
220
  self._is_validated: bool = False
@@ -208,7 +223,7 @@ class Curator:
208
223
  @doc_args(VALIDATE_DOCSTRING)
209
224
  def validate(self) -> bool | str:
210
225
  """{}""" # noqa: D415
211
- pass # pdagma: no cover
226
+ pass # pragma: no cover
212
227
 
213
228
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
214
229
  def save_artifact(
@@ -222,12 +237,97 @@ class Curator:
222
237
  """{}""" # noqa: D415
223
238
  # Note that this docstring has to be consistent with the Artifact()
224
239
  # constructor signature
225
- pass
240
+ pass # pragma: no cover
241
+
242
+
243
+ class SlotsCurator(Curator):
244
+ """Curator for a dataset with slots.
245
+
246
+ Args:
247
+ dataset: The dataset to validate & annotate.
248
+ schema: A `Schema` object that defines the validation constraints.
249
+
250
+ .. versionadded:: 1.3.0
251
+ """
252
+
253
+ def __init__(
254
+ self,
255
+ dataset: Any,
256
+ schema: Schema,
257
+ ) -> None:
258
+ super().__init__(dataset=dataset, schema=schema)
259
+ self._slots: dict[str, DataFrameCurator] = {}
260
+
261
+ # used in MuDataCurator and SpatialDataCurator
262
+ # in form of {table/modality_key: var_field}
263
+ self._var_fields: dict[str, FieldAttr] = {}
264
+ # in form of {table/modality_key: categoricals}
265
+ self._categoricals: dict[str, dict[str, FieldAttr]] = {}
266
+
267
+ @property
268
+ @doc_args(SLOTS_DOCSTRING)
269
+ def slots(self) -> dict[str, DataFrameCurator]:
270
+ """{}""" # noqa: D415
271
+ return self._slots
272
+
273
+ @doc_args(VALIDATE_DOCSTRING)
274
+ def validate(self) -> None:
275
+ """{}""" # noqa: D415
276
+ for _, curator in self._slots.items():
277
+ curator.validate()
278
+
279
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
280
+ def save_artifact(
281
+ self,
282
+ *,
283
+ key: str | None = None,
284
+ description: str | None = None,
285
+ revises: Artifact | None = None,
286
+ run: Run | None = None,
287
+ ) -> Artifact:
288
+ """{}""" # noqa: D415
289
+ if not self._is_validated:
290
+ self.validate()
291
+
292
+ # default implementation for MuDataCurator and SpatialDataCurator
293
+ return save_artifact( # type: ignore
294
+ self._dataset,
295
+ key=key,
296
+ description=description,
297
+ fields=self._categoricals,
298
+ index_field=self._var_fields,
299
+ artifact=self._artifact,
300
+ revises=revises,
301
+ run=run,
302
+ schema=self._schema,
303
+ )
304
+
305
+
306
+ def check_dtype(expected_type) -> Callable:
307
+ """Creates a check function for Pandera that validates a column's dtype.
308
+
309
+ Args:
310
+ expected_type: String identifier for the expected type ('int', 'float', or 'num')
311
+
312
+ Returns:
313
+ A function that checks if a series has the expected dtype
314
+ """
315
+
316
+ def check_function(series):
317
+ if expected_type == "int":
318
+ is_valid = pd.api.types.is_integer_dtype(series.dtype)
319
+ elif expected_type == "float":
320
+ is_valid = pd.api.types.is_float_dtype(series.dtype)
321
+ elif expected_type == "num":
322
+ is_valid = pd.api.types.is_numeric_dtype(series.dtype)
323
+ return is_valid
324
+
325
+ return check_function
226
326
 
227
327
 
228
328
  class DataFrameCurator(Curator):
229
329
  # the example in the docstring is tested in test_curators_quickstart_example
230
- """Curator for a DataFrame object.
330
+ """Curator for `DataFrame`.
231
331
 
232
332
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
233
333
 
@@ -278,12 +378,33 @@ class DataFrameCurator(Curator):
278
378
  # populate features
279
379
  pandera_columns = {}
280
380
  for feature in schema.features.all():
281
- pandera_dtype = (
282
- feature.dtype if not feature.dtype.startswith("cat") else "category"
283
- )
284
- pandera_columns[feature.name] = pandera.Column(
285
- pandera_dtype, nullable=feature.nullable
286
- )
381
+ if feature.dtype in {"int", "float", "num"}:
382
+ dtype = (
383
+ self._dataset[feature.name].dtype
384
+ if feature.name in self._dataset.columns
385
+ else None
386
+ )
387
+ pandera_columns[feature.name] = pandera.Column(
388
+ dtype=None,
389
+ checks=pandera.Check(
390
+ check_dtype(feature.dtype),
391
+ element_wise=False,
392
+ error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
393
+ ),
394
+ nullable=feature.nullable,
395
+ coerce=feature.coerce_dtype,
396
+ )
397
+ else:
398
+ pandera_dtype = (
399
+ feature.dtype
400
+ if not feature.dtype.startswith("cat")
401
+ else "category"
402
+ )
403
+ pandera_columns[feature.name] = pandera.Column(
404
+ pandera_dtype,
405
+ nullable=feature.nullable,
406
+ coerce=feature.coerce_dtype,
407
+ )
287
408
  if feature.dtype.startswith("cat"):
288
409
  categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
289
410
  self._pandera_schema = pandera.DataFrameSchema(
@@ -293,7 +414,7 @@ class DataFrameCurator(Curator):
293
414
  assert schema.itype is not None # noqa: S101
294
415
  self._cat_manager = DataFrameCatManager(
295
416
  self._dataset,
296
- columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
417
+ columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
297
418
  categoricals=categoricals,
298
419
  )
299
420
 
@@ -378,16 +499,16 @@ class DataFrameCurator(Curator):
378
499
  description: str | None = None,
379
500
  revises: Artifact | None = None,
380
501
  run: Run | None = None,
381
- ):
502
+ ) -> Artifact:
382
503
  """{}""" # noqa: D415
383
504
  if not self._is_validated:
384
505
  self.validate() # raises ValidationError if doesn't validate
385
- result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
506
+ result = parse_cat_dtype(self._schema.itype, is_itype=True)
386
507
  return save_artifact( # type: ignore
387
508
  self._dataset,
388
509
  description=description,
389
510
  fields=self._cat_manager.categoricals,
390
- columns_field=result["field"],
511
+ index_field=result["field"],
391
512
  key=key,
392
513
  artifact=self._artifact,
393
514
  revises=revises,
@@ -396,9 +517,9 @@ class DataFrameCurator(Curator):
396
517
  )
397
518
 
398
519
 
399
- class AnnDataCurator(Curator):
520
+ class AnnDataCurator(SlotsCurator):
400
521
  # the example in the docstring is tested in test_curators_quickstart_example
401
- """Curator for a DataFrame object.
522
+ """Curator for `AnnData`.
402
523
 
403
524
  See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
404
525
 
@@ -446,7 +567,7 @@ class AnnDataCurator(Curator):
446
567
  ).save()
447
568
 
448
569
  # curate an AnnData
449
- adata = datasets.small_dataset1(otype="AnnData")
570
+ adata = ln.core.datasets.small_dataset1(otype="AnnData")
450
571
  curator = ln.curators.AnnDataCurator(adata, anndata_schema)
451
572
  artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
452
573
  assert artifact.schema == anndata_schema
@@ -466,28 +587,16 @@ class AnnDataCurator(Curator):
466
587
  self._slots = {
467
588
  slot: DataFrameCurator(
468
589
  (
469
- self._dataset.__getattribute__(slot).T
590
+ getattr(self._dataset, slot).T
470
591
  if slot == "var"
471
- else self._dataset.__getattribute__(slot)
592
+ else getattr(self._dataset, slot)
472
593
  ),
473
594
  slot_schema,
474
595
  )
475
596
  for slot, slot_schema in schema.slots.items()
476
- if slot in {"obs", "var"}
597
+ if slot in {"obs", "var", "uns"}
477
598
  }
478
599
 
479
- @property
480
- @doc_args(SLOTS_DOCSTRING)
481
- def slots(self) -> dict[str, DataFrameCurator]:
482
- """{}""" # noqa: D415
483
- return self._slots
484
-
485
- @doc_args(VALIDATE_DOCSTRING)
486
- def validate(self) -> None:
487
- """{}""" # noqa: D415
488
- for _, curator in self._slots.items():
489
- curator.validate()
490
-
491
600
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
492
601
  def save_artifact(
493
602
  self,
@@ -496,18 +605,20 @@ class AnnDataCurator(Curator):
496
605
  description: str | None = None,
497
606
  revises: Artifact | None = None,
498
607
  run: Run | None = None,
499
- ):
608
+ ) -> Artifact:
500
609
  """{}""" # noqa: D415
501
610
  if not self._is_validated:
502
611
  self.validate()
612
+ if "obs" in self.slots:
613
+ categoricals = self.slots["obs"]._cat_manager.categoricals
614
+ else:
615
+ categoricals = {}
503
616
  return save_artifact( # type: ignore
504
617
  self._dataset,
505
618
  description=description,
506
- fields=self.slots["obs"]._cat_manager.categoricals,
507
- columns_field=(
508
- parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
509
- "field"
510
- ]
619
+ fields=categoricals,
620
+ index_field=(
621
+ parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
511
622
  if "var" in self._slots
512
623
  else None
513
624
  ),
@@ -519,34 +630,286 @@ class AnnDataCurator(Curator):
519
630
  )
520
631
 
521
632
 
522
- class CatManager:
523
- """Manage valid categoricals by updating registries.
633
+ def _assign_var_fields_categoricals_multimodal(
634
+ modality: str | None,
635
+ slot_type: str,
636
+ slot: str,
637
+ slot_schema: Schema,
638
+ var_fields: dict[str, FieldAttr],
639
+ categoricals: dict[str, dict[str, FieldAttr]],
640
+ slots: dict[str, DataFrameCurator],
641
+ ) -> None:
642
+ """Assigns var_fields and categoricals for multimodal data curators."""
643
+ if modality is not None:
644
+ # Makes sure that all tables are present
645
+ var_fields[modality] = None
646
+ categoricals[modality] = {}
647
+
648
+ if slot_type == "var":
649
+ var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
650
+ if modality is None:
651
+ # This should rarely/never be used since tables should have different var fields
652
+ var_fields[slot] = var_field # pragma: no cover
653
+ else:
654
+ # Note that this is NOT nested since the nested key is always "var"
655
+ var_fields[modality] = var_field
656
+ else:
657
+ obs_fields = slots[slot]._cat_manager.categoricals
658
+ if modality is None:
659
+ categoricals[slot] = obs_fields
660
+ else:
661
+ # Note that this is NOT nested since the nested key is always "obs"
662
+ categoricals[modality] = obs_fields
663
+
664
+
665
+ class MuDataCurator(SlotsCurator):
666
+ # the example in the docstring is tested in test_curators_quickstart_example
667
+ """Curator for `MuData`.
524
668
 
525
- A `CatManager` object makes it easy to validate, standardize & annotate datasets.
669
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
526
670
 
527
- Example:
671
+ .. versionadded:: 1.3.0
528
672
 
529
- >>> cat_manager = ln.CatManager(
530
- >>> dataset,
531
- >>> # define validation criteria as mappings
532
- >>> columns=Feature.name, # map column names
533
- >>> categoricals={"perturbation": ULabel.name}, # map categories
534
- >>> )
535
- >>> cat_manager.validate() # validate the dataframe
536
- >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
537
- >>> artifact.describe() # see annotations
673
+ Args:
674
+ dataset: The MuData-like object to validate & annotate.
675
+ schema: A `Schema` object that defines the validation constraints.
538
676
 
539
- `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
677
+ Example::
540
678
 
541
- If you find non-validated values, you have several options:
679
+ import lamindb as ln
680
+ import bionty as bt
681
+
682
+ # define the global obs schema
683
+ obs_schema = ln.Schema(
684
+ name="mudata_papalexi21_subset_obs_schema",
685
+ features=[
686
+ ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
687
+ ln.Feature(name="replicate", dtype="cat[ULabel[Replicate]]").save(),
688
+ ],
689
+ ).save()
690
+
691
+ # define the ['rna'].obs schema
692
+ obs_schema_rna = ln.Schema(
693
+ name="mudata_papalexi21_subset_rna_obs_schema",
694
+ features=[
695
+ ln.Feature(name="nCount_RNA", dtype=int).save(),
696
+ ln.Feature(name="nFeature_RNA", dtype=int).save(),
697
+ ln.Feature(name="percent.mito", dtype=float).save(),
698
+ ],
699
+ coerce_dtype=True,
700
+ ).save()
701
+
702
+ # define the ['hto'].obs schema
703
+ obs_schema_hto = ln.Schema(
704
+ name="mudata_papalexi21_subset_hto_obs_schema",
705
+ features=[
706
+ ln.Feature(name="nCount_HTO", dtype=int).save(),
707
+ ln.Feature(name="nFeature_HTO", dtype=int).save(),
708
+ ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(),
709
+ ],
710
+ coerce_dtype=True,
711
+ ).save()
712
+
713
+ # define ['rna'].var schema
714
+ var_schema_rna = ln.Schema(
715
+ name="mudata_papalexi21_subset_rna_var_schema",
716
+ itype=bt.Gene.symbol,
717
+ dtype=float,
718
+ ).save()
542
719
 
543
- - new values found in the data can be registered using :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
544
- - non-validated values can be accessed using :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
720
+ # define composite schema
721
+ mudata_schema = ln.Schema(
722
+ name="mudata_papalexi21_subset_mudata_schema",
723
+ otype="MuData",
724
+ components={
725
+ "obs": obs_schema,
726
+ "rna:obs": obs_schema_rna,
727
+ "hto:obs": obs_schema_hto,
728
+ "rna:var": var_schema_rna,
729
+ },
730
+ ).save()
731
+
732
+ # curate a MuData
733
+ mdata = ln.core.datasets.mudata_papalexi21_subset()
734
+ bt.settings.organism = "human" # set the organism
735
+ curator = ln.curators.MuDataCurator(mdata, mudata_schema)
736
+ artifact = curator.save_artifact(key="example_datasets/mudata_papalexi21_subset.h5mu")
737
+ assert artifact.schema == mudata_schema
545
738
  """
546
739
 
547
740
  def __init__(
548
- self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
549
- ):
741
+ self,
742
+ dataset: MuData | Artifact,
743
+ schema: Schema,
744
+ ) -> None:
745
+ super().__init__(dataset=dataset, schema=schema)
746
+ if not data_is_mudata(self._dataset):
747
+ raise InvalidArgument("dataset must be MuData-like.")
748
+ if schema.otype != "MuData":
749
+ raise InvalidArgument("Schema otype must be 'MuData'.")
750
+
751
+ for slot, slot_schema in schema.slots.items():
752
+ # Assign to _slots
753
+ if ":" in slot:
754
+ modality, modality_slot = slot.split(":")
755
+ schema_dataset = self._dataset.__getitem__(modality)
756
+ else:
757
+ modality, modality_slot = None, slot
758
+ schema_dataset = self._dataset
759
+ self._slots[slot] = DataFrameCurator(
760
+ (
761
+ getattr(schema_dataset, modality_slot).T
762
+ if modality_slot == "var"
763
+ else getattr(schema_dataset, modality_slot)
764
+ ),
765
+ slot_schema,
766
+ )
767
+ _assign_var_fields_categoricals_multimodal(
768
+ modality=modality,
769
+ slot_type=modality_slot,
770
+ slot=slot,
771
+ slot_schema=slot_schema,
772
+ var_fields=self._var_fields,
773
+ categoricals=self._categoricals,
774
+ slots=self._slots,
775
+ )
776
+
777
+ # for consistency with BaseCatManager
778
+ self._columns_field = self._var_fields
779
+
780
+
781
+ class SpatialDataCurator(SlotsCurator):
782
+ # the example in the docstring is tested in test_curators_quickstart_example
783
+ """Curator for `SpatialData`.
784
+
785
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
786
+
787
+ .. versionadded:: 1.3.0
788
+
789
+ Args:
790
+ dataset: The SpatialData-like object to validate & annotate.
791
+ schema: A `Schema` object that defines the validation constraints.
792
+
793
+ Example::
794
+
795
+ import lamindb as ln
796
+ import bionty as bt
797
+
798
+ # define sample schema
799
+ sample_schema = ln.Schema(
800
+ name="blobs_sample_level_metadata",
801
+ features=[
802
+ ln.Feature(name="assay", dtype=bt.ExperimentalFactor).save(),
803
+ ln.Feature(name="disease", dtype=bt.Disease).save(),
804
+ ln.Feature(name="development_stage", dtype=bt.DevelopmentalStage).save(),
805
+ ],
806
+ coerce_dtype=True
807
+ ).save()
808
+
809
+ # define table obs schema
810
+ blobs_obs_schema = ln.Schema(
811
+ name="blobs_obs_level_metadata",
812
+ features=[
813
+ ln.Feature(name="sample_region", dtype="str").save(),
814
+ ],
815
+ coerce_dtype=True
816
+ ).save()
817
+
818
+ # define table var schema
819
+ blobs_var_schema = ln.Schema(
820
+ name="blobs_var_schema",
821
+ itype=bt.Gene.ensembl_gene_id,
822
+ dtype=int
823
+ ).save()
824
+
825
+ # define composite schema
826
+ spatialdata_schema = ln.Schema(
827
+ name="blobs_spatialdata_schema",
828
+ otype="SpatialData",
829
+ components={
830
+ "sample": sample_schema,
831
+ "table:obs": blobs_obs_schema,
832
+ "table:var": blobs_var_schema,
833
+ }).save()
834
+
835
+ # curate a SpatialData
836
+ spatialdata = ln.core.datasets.spatialdata_blobs()
837
+ curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_schema)
838
+ try:
839
+ curator.validate()
840
+ except ln.errors.ValidationError as error:
841
+ print(error)
842
+
843
+ # validate again (must pass now) and save artifact
844
+ artifact = curator.save_artifact(key="example_datasets/spatialdata1.zarr")
845
+ assert artifact.schema == spatialdata_schema
846
+ """
847
+
848
+ def __init__(
849
+ self,
850
+ dataset: SpatialData | Artifact,
851
+ schema: Schema,
852
+ *,
853
+ sample_metadata_key: str | None = "sample",
854
+ ) -> None:
855
+ super().__init__(dataset=dataset, schema=schema)
856
+ if not data_is_spatialdata(self._dataset):
857
+ raise InvalidArgument("dataset must be SpatialData-like.")
858
+ if schema.otype != "SpatialData":
859
+ raise InvalidArgument("Schema otype must be 'SpatialData'.")
860
+
861
+ for slot, slot_schema in schema.slots.items():
862
+ # Assign to _slots
863
+ if ":" in slot:
864
+ table_key, table_slot = slot.split(":")
865
+ schema_dataset = self._dataset.tables.__getitem__(table_key)
866
+ # sample metadata (does not have a `:` separator)
867
+ else:
868
+ table_key = None
869
+ table_slot = slot
870
+ schema_dataset = self._dataset.get_attrs(
871
+ key=sample_metadata_key, return_as="df", flatten=True
872
+ )
873
+
874
+ self._slots[slot] = DataFrameCurator(
875
+ (
876
+ getattr(schema_dataset, table_slot).T
877
+ if table_slot == "var"
878
+ else (
879
+ getattr(schema_dataset, table_slot)
880
+ if table_slot != sample_metadata_key
881
+ else schema_dataset
882
+ ) # just take the schema_dataset if it's the sample metadata key
883
+ ),
884
+ slot_schema,
885
+ )
886
+
887
+ _assign_var_fields_categoricals_multimodal(
888
+ modality=table_key,
889
+ slot_type=table_slot,
890
+ slot=slot,
891
+ slot_schema=slot_schema,
892
+ var_fields=self._var_fields,
893
+ categoricals=self._categoricals,
894
+ slots=self._slots,
895
+ )
896
+
897
+ # for consistency with BaseCatManager
898
+ self._columns_field = self._var_fields
899
+
900
+
901
+ class CatManager:
902
+ """Manage categoricals by updating registries.
903
+
904
+ This class is accessible from within a `DataFrameCurator` via the `.cat` attribute.
905
+
906
+ If you find non-validated values, you have several options:
907
+
908
+ - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
909
+ - non-validated values can be accessed via `DataFrameCurator.cat.non_validated` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
910
+ """
911
+
912
+ def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
550
913
  # the below is shared with Curator
551
914
  self._artifact: Artifact = None # pass the dataset as an artifact
552
915
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
@@ -558,11 +921,16 @@ class CatManager:
558
921
  # shared until here
559
922
  self._categoricals = categoricals or {}
560
923
  self._non_validated = None
561
- self._organism = organism
562
924
  self._sources = sources or {}
563
- self._exclude = exclude or {}
564
925
  self._columns_field = columns_field
565
926
  self._validate_category_error_messages: str = ""
927
+ # make sure to only fetch organism once at the beginning
928
+ if organism:
929
+ self._organism = organism
930
+ else:
931
+ fields = list(self._categoricals.values()) + [columns_field]
932
+ organisms = {get_organism_kwargs(field).get("organism") for field in fields}
933
+ self._organism = organisms.pop() if len(organisms) > 0 else None
566
934
 
567
935
  @property
568
936
  def non_validated(self) -> dict[str, list[str]]:
@@ -607,7 +975,7 @@ class CatManager:
607
975
  Returns:
608
976
  The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
609
977
  """
610
- pass
978
+ pass # pragma: no cover
611
979
 
612
980
  def standardize(self, key: str) -> None:
613
981
  """Replace synonyms with standardized values.
@@ -620,7 +988,7 @@ class CatManager:
620
988
  Returns:
621
989
  None
622
990
  """
623
- pass # pdagma: no cover
991
+ pass # pragma: no cover
624
992
 
625
993
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
626
994
  def save_artifact(
@@ -632,64 +1000,30 @@ class CatManager:
632
1000
  run: Run | None = None,
633
1001
  ) -> Artifact:
634
1002
  """{}""" # noqa: D415
635
- from lamindb.core._settings import settings
636
-
1003
+ # Make sure all labels are saved in the current instance
637
1004
  if not self._is_validated:
638
1005
  self.validate() # returns True or False
639
1006
  if not self._is_validated: # need to raise error manually
640
1007
  raise ValidationError("Dataset does not validate. Please curate.")
641
1008
 
642
- # Make sure all labels are saved in the current instance
643
- verbosity = settings.verbosity
644
- try:
645
- settings.verbosity = "warning"
646
- self._artifact = save_artifact( # type: ignore
647
- self._dataset,
648
- description=description,
649
- fields=self.categoricals,
650
- columns_field=self._columns_field,
651
- key=key,
652
- artifact=self._artifact,
653
- revises=revises,
654
- run=run,
655
- schema=None,
656
- organism=self._organism,
657
- )
658
- finally:
659
- settings.verbosity = verbosity
1009
+ self._artifact = save_artifact( # type: ignore
1010
+ self._dataset,
1011
+ key=key,
1012
+ description=description,
1013
+ fields=self.categoricals,
1014
+ index_field=self._columns_field,
1015
+ artifact=self._artifact,
1016
+ revises=revises,
1017
+ run=run,
1018
+ schema=None,
1019
+ organism=self._organism,
1020
+ )
660
1021
 
661
1022
  return self._artifact
662
1023
 
663
1024
 
664
1025
  class DataFrameCatManager(CatManager):
665
- """Curation flow for a DataFrame object.
666
-
667
- See also :class:`~lamindb.Curator`.
668
-
669
- Args:
670
- df: The DataFrame object to curate.
671
- columns: The field attribute for the feature column.
672
- categoricals: A dictionary mapping column names to registry_field.
673
- verbosity: The verbosity level.
674
- organism: The organism name.
675
- sources: A dictionary mapping column names to Source records.
676
- exclude: A dictionary mapping column names to values to exclude from validation.
677
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
678
- using the exclude parameter ensures they are not validated.
679
-
680
- Returns:
681
- A curator object.
682
-
683
- Examples:
684
- >>> import bionty as bt
685
- >>> curator = ln.Curator.from_df(
686
- ... df,
687
- ... categoricals={
688
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
689
- ... "donor_id": ULabel.name
690
- ... }
691
- ... )
692
- """
1026
+ """Categorical manager for `DataFrame`."""
693
1027
 
694
1028
  def __init__(
695
1029
  self,
@@ -699,10 +1033,7 @@ class DataFrameCatManager(CatManager):
699
1033
  verbosity: str = "hint",
700
1034
  organism: str | None = None,
701
1035
  sources: dict[str, Record] | None = None,
702
- exclude: dict | None = None,
703
1036
  ) -> None:
704
- from lamindb.core._settings import settings
705
-
706
1037
  if organism is not None and not isinstance(organism, str):
707
1038
  raise ValueError("organism must be a string such as 'human' or 'mouse'!")
708
1039
 
@@ -714,20 +1045,21 @@ class DataFrameCatManager(CatManager):
714
1045
  organism=organism,
715
1046
  categoricals=categoricals,
716
1047
  sources=sources,
717
- exclude=exclude,
718
1048
  )
719
1049
  self._save_columns()
720
1050
 
721
- def lookup(self, public: bool = False) -> CurateLookup:
1051
+ def lookup(self, public: bool = False) -> CatLookup:
722
1052
  """Lookup categories.
723
1053
 
724
1054
  Args:
725
1055
  public: If `True`, the lookup is performed on the public reference.
726
1056
  """
727
- return CurateLookup(
1057
+ return CatLookup(
728
1058
  categoricals=self._categoricals,
729
1059
  slots={"columns": self._columns_field},
730
1060
  public=public,
1061
+ organism=self._organism,
1062
+ sources=self._sources,
731
1063
  )
732
1064
 
733
1065
  def _save_columns(self, validated_only: bool = True) -> None:
@@ -736,28 +1068,26 @@ class DataFrameCatManager(CatManager):
736
1068
  update_registry(
737
1069
  values=list(self.categoricals.keys()),
738
1070
  field=self._columns_field,
739
- key="columns",
1071
+ key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
740
1072
  validated_only=False,
741
1073
  source=self._sources.get("columns"),
742
- exclude=self._exclude.get("columns"),
743
1074
  )
744
1075
 
745
1076
  # Save the rest of the columns based on validated_only
746
- additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
1077
+ additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
747
1078
  if additional_columns:
748
1079
  update_registry(
749
1080
  values=list(additional_columns),
750
1081
  field=self._columns_field,
751
- key="columns",
1082
+ key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
752
1083
  validated_only=validated_only,
753
1084
  df=self._dataset, # Get the Feature type from df
754
1085
  source=self._sources.get("columns"),
755
- exclude=self._exclude.get("columns"),
756
1086
  )
757
1087
 
758
1088
  @deprecated(new_name="is run by default")
759
1089
  def add_new_from_columns(self, organism: str | None = None, **kwargs):
760
- pass
1090
+ pass # pragma: no cover
761
1091
 
762
1092
  def validate(self) -> bool:
763
1093
  """Validate variables and categorical observations.
@@ -778,7 +1108,6 @@ class DataFrameCatManager(CatManager):
778
1108
  self._dataset,
779
1109
  fields=self.categoricals,
780
1110
  sources=self._sources,
781
- exclude=self._exclude,
782
1111
  curator=self,
783
1112
  organism=self._organism,
784
1113
  )
@@ -814,7 +1143,7 @@ class DataFrameCatManager(CatManager):
814
1143
  else:
815
1144
  if key not in avail_keys:
816
1145
  if key in self._categoricals:
817
- logger.info(f"No unstandardized values found for {key!r}")
1146
+ logger.warning(f"No non-standardized values found for {key!r}")
818
1147
  else:
819
1148
  raise KeyError(
820
1149
  f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
@@ -852,7 +1181,6 @@ class DataFrameCatManager(CatManager):
852
1181
  key=categorical,
853
1182
  validated_only=validated_only,
854
1183
  source=self._sources.get(categorical),
855
- exclude=self._exclude.get(categorical),
856
1184
  organism=self._organism,
857
1185
  )
858
1186
  # adding new records removes them from non_validated
@@ -882,32 +1210,7 @@ class DataFrameCatManager(CatManager):
882
1210
 
883
1211
 
884
1212
  class AnnDataCatManager(CatManager):
885
- """Manage categorical curation.
886
-
887
- Args:
888
- data: The AnnData object or an AnnData-like path.
889
- var_index: The registry field for mapping the ``.var`` index.
890
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
891
- obs_columns: The registry field for mapping the ``.obs.columns``.
892
- verbosity: The verbosity level.
893
- organism: The organism name.
894
- sources: A dictionary mapping ``.obs.columns`` to Source records.
895
- exclude: A dictionary mapping column names to values to exclude from validation.
896
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
897
- using the exclude parameter ensures they are not validated.
898
-
899
- Examples:
900
- >>> import bionty as bt
901
- >>> curator = ln.Curator.from_anndata(
902
- ... adata,
903
- ... var_index=bt.Gene.ensembl_gene_id,
904
- ... categoricals={
905
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
906
- ... "donor_id": ULabel.name
907
- ... },
908
- ... organism="human",
909
- ... )
910
- """
1213
+ """Categorical manager for `AnnData`."""
911
1214
 
912
1215
  def __init__(
913
1216
  self,
@@ -918,13 +1221,12 @@ class AnnDataCatManager(CatManager):
918
1221
  verbosity: str = "hint",
919
1222
  organism: str | None = None,
920
1223
  sources: dict[str, Record] | None = None,
921
- exclude: dict | None = None,
922
1224
  ) -> None:
923
1225
  if isinstance(var_index, str):
924
- raise TypeError("var_index parameter has to be a bionty field")
1226
+ raise TypeError(
1227
+ "var_index parameter has to be a field, e.g. Gene.ensembl_gene_id"
1228
+ )
925
1229
 
926
- if sources is None:
927
- sources = {}
928
1230
  if not data_is_anndata(data):
929
1231
  raise TypeError("data has to be an AnnData object")
930
1232
 
@@ -935,12 +1237,12 @@ class AnnDataCatManager(CatManager):
935
1237
 
936
1238
  self._obs_fields = categoricals or {}
937
1239
  self._var_field = var_index
1240
+ self._sources = sources or {}
938
1241
  super().__init__(
939
1242
  dataset=data,
940
1243
  categoricals=categoricals,
941
- sources=sources,
1244
+ sources=self._sources,
942
1245
  organism=organism,
943
- exclude=exclude,
944
1246
  columns_field=var_index,
945
1247
  )
946
1248
  self._adata = self._dataset
@@ -950,8 +1252,7 @@ class AnnDataCatManager(CatManager):
950
1252
  columns=obs_columns,
951
1253
  verbosity=verbosity,
952
1254
  organism=None,
953
- sources=sources,
954
- exclude=exclude,
1255
+ sources=self._sources,
955
1256
  )
956
1257
 
957
1258
  @property
@@ -964,16 +1265,18 @@ class AnnDataCatManager(CatManager):
964
1265
  """Return the obs fields to validate against."""
965
1266
  return self._obs_fields
966
1267
 
967
- def lookup(self, public: bool = False) -> CurateLookup:
1268
+ def lookup(self, public: bool = False) -> CatLookup:
968
1269
  """Lookup categories.
969
1270
 
970
1271
  Args:
971
1272
  public: If "public", the lookup is performed on the public reference.
972
1273
  """
973
- return CurateLookup(
1274
+ return CatLookup(
974
1275
  categoricals=self._obs_fields,
975
1276
  slots={"columns": self._columns_field, "var_index": self._var_field},
976
1277
  public=public,
1278
+ organism=self._organism,
1279
+ sources=self._sources,
977
1280
  )
978
1281
 
979
1282
  def _save_from_var_index(
@@ -989,7 +1292,6 @@ class AnnDataCatManager(CatManager):
989
1292
  validated_only=validated_only,
990
1293
  organism=self._organism,
991
1294
  source=self._sources.get("var_index"),
992
- exclude=self._exclude.get("var_index"),
993
1295
  )
994
1296
 
995
1297
  def add_new_from(self, key: str, **kwargs):
@@ -1033,7 +1335,6 @@ class AnnDataCatManager(CatManager):
1033
1335
  key="var_index",
1034
1336
  source=self._sources.get("var_index"),
1035
1337
  hint_print=".add_new_from_var_index()",
1036
- exclude=self._exclude.get("var_index"),
1037
1338
  organism=self._organism, # type: ignore
1038
1339
  )
1039
1340
  else:
@@ -1077,59 +1378,29 @@ class AnnDataCatManager(CatManager):
1077
1378
 
1078
1379
 
1079
1380
  class MuDataCatManager(CatManager):
1080
- """Curation flow for a ``MuData`` object.
1081
-
1082
- Args:
1083
- mdata: The MuData object to curate.
1084
- var_index: The registry field for mapping the ``.var`` index for each modality.
1085
- For example:
1086
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
1087
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
1088
- Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
1089
- verbosity: The verbosity level.
1090
- organism: The organism name.
1091
- sources: A dictionary mapping ``.obs.columns`` to Source records.
1092
- exclude: A dictionary mapping column names to values to exclude from validation.
1093
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1094
- using the exclude parameter ensures they are not validated.
1095
-
1096
- Examples:
1097
- >>> import bionty as bt
1098
- >>> curator = ln.Curator.from_mudata(
1099
- ... mdata,
1100
- ... var_index={
1101
- ... "rna": bt.Gene.ensembl_gene_id,
1102
- ... "adt": CellMarker.name
1103
- ... },
1104
- ... categoricals={
1105
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1106
- ... "donor_id": ULabel.name
1107
- ... },
1108
- ... organism="human",
1109
- ... )
1110
- """
1381
+ """Categorical manager for `MuData`."""
1111
1382
 
1112
1383
  def __init__(
1113
1384
  self,
1114
1385
  mdata: MuData | Artifact,
1115
- var_index: dict[str, FieldAttr],
1386
+ var_index: dict[str, FieldAttr] | None = None,
1116
1387
  categoricals: dict[str, FieldAttr] | None = None,
1117
1388
  verbosity: str = "hint",
1118
1389
  organism: str | None = None,
1119
1390
  sources: dict[str, Record] | None = None,
1120
- exclude: dict | None = None, # {modality: {field: [values]}}
1121
1391
  ) -> None:
1122
1392
  super().__init__(
1123
1393
  dataset=mdata,
1124
1394
  categoricals={},
1125
1395
  sources=sources,
1126
1396
  organism=organism,
1127
- exclude=exclude,
1128
1397
  )
1129
- self._columns_field = var_index # this is for consistency with BaseCatManager
1130
- self._var_fields = var_index
1398
+ self._columns_field = (
1399
+ var_index or {}
1400
+ ) # this is for consistency with BaseCatManager
1401
+ self._var_fields = var_index or {}
1131
1402
  self._verify_modality(self._var_fields.keys())
1132
- self._obs_fields = self._parse_categoricals(categoricals)
1403
+ self._obs_fields = self._parse_categoricals(categoricals or {})
1133
1404
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
1134
1405
  self._verbosity = verbosity
1135
1406
  self._obs_df_curator = None
@@ -1140,7 +1411,6 @@ class MuDataCatManager(CatManager):
1140
1411
  categoricals=self._obs_fields.get("obs", {}),
1141
1412
  verbosity=verbosity,
1142
1413
  sources=self._sources.get("obs"),
1143
- exclude=self._exclude.get("obs"),
1144
1414
  organism=organism,
1145
1415
  )
1146
1416
  self._mod_adata_curators = {
@@ -1150,7 +1420,6 @@ class MuDataCatManager(CatManager):
1150
1420
  categoricals=self._obs_fields.get(modality),
1151
1421
  verbosity=verbosity,
1152
1422
  sources=self._sources.get(modality),
1153
- exclude=self._exclude.get(modality),
1154
1423
  organism=organism,
1155
1424
  )
1156
1425
  for modality in self._modalities
@@ -1199,7 +1468,7 @@ class MuDataCatManager(CatManager):
1199
1468
  obs_fields["obs"][k] = v
1200
1469
  return obs_fields
1201
1470
 
1202
- def lookup(self, public: bool = False) -> CurateLookup:
1471
+ def lookup(self, public: bool = False) -> CatLookup:
1203
1472
  """Lookup categories.
1204
1473
 
1205
1474
  Args:
@@ -1212,12 +1481,14 @@ class MuDataCatManager(CatManager):
1212
1481
  obs_fields[k] = v
1213
1482
  else:
1214
1483
  obs_fields[f"{mod}:{k}"] = v
1215
- return CurateLookup(
1484
+ return CatLookup(
1216
1485
  categoricals=obs_fields,
1217
1486
  slots={
1218
1487
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
1219
1488
  },
1220
1489
  public=public,
1490
+ organism=self._organism,
1491
+ sources=self._sources,
1221
1492
  )
1222
1493
 
1223
1494
  @deprecated(new_name="is run by default")
@@ -1227,7 +1498,7 @@ class MuDataCatManager(CatManager):
1227
1498
  column_names: list[str] | None = None,
1228
1499
  **kwargs,
1229
1500
  ):
1230
- pass
1501
+ pass # pragma: no cover
1231
1502
 
1232
1503
  def add_new_from_var_index(self, modality: str, **kwargs):
1233
1504
  """Update variable records.
@@ -1271,16 +1542,8 @@ class MuDataCatManager(CatManager):
1271
1542
 
1272
1543
  def validate(self) -> bool:
1273
1544
  """Validate categories."""
1274
- from lamindb.core._settings import settings
1275
-
1276
1545
  # add all validated records to the current instance
1277
- verbosity = settings.verbosity
1278
- try:
1279
- settings.verbosity = "error"
1280
- self._update_registry_all()
1281
- finally:
1282
- settings.verbosity = verbosity
1283
-
1546
+ self._update_registry_all()
1284
1547
  self._non_validated = {} # type: ignore
1285
1548
 
1286
1549
  obs_validated = True
@@ -1329,393 +1592,287 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1329
1592
  )
1330
1593
 
1331
1594
 
1332
- class TiledbsomaCatManager(CatManager):
1333
- """Curation flow for `tiledbsoma.Experiment`.
1334
-
1335
- Args:
1336
- experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
1337
- var_index: The registry fields for mapping the `.var` indices for measurements.
1338
- Should be in the form `{"measurement name": ("var column", field)}`.
1339
- These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
1340
- in `.standardize` or `.add_new_from`, see the output of `.var_index`.
1341
- categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
1342
- obs_columns: The registry field for mapping the names of the `.obs` columns.
1343
- organism: The organism name.
1344
- sources: A dictionary mapping `.obs` columns to Source records.
1345
- exclude: A dictionary mapping column names to values to exclude from validation.
1346
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1347
- using the exclude parameter ensures they are not validated.
1348
-
1349
- Examples:
1350
- >>> import bionty as bt
1351
- >>> curator = ln.Curator.from_tiledbsoma(
1352
- ... "./my_array_store.tiledbsoma",
1353
- ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1354
- ... categoricals={
1355
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
1356
- ... "donor_id": ULabel.name
1357
- ... },
1358
- ... organism="human",
1359
- ... )
1360
- """
1595
+ class SpatialDataCatManager(CatManager):
1596
+ """Categorical manager for `SpatialData`."""
1361
1597
 
1362
1598
  def __init__(
1363
1599
  self,
1364
- experiment_uri: UPathStr | Artifact,
1365
- var_index: dict[str, tuple[str, FieldAttr]],
1366
- categoricals: dict[str, FieldAttr] | None = None,
1367
- obs_columns: FieldAttr = Feature.name,
1600
+ sdata: Any,
1601
+ var_index: dict[str, FieldAttr],
1602
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1603
+ verbosity: str = "hint",
1368
1604
  organism: str | None = None,
1369
- sources: dict[str, Record] | None = None,
1370
- exclude: dict[str, str | list[str]] | None = None,
1371
- ):
1372
- self._obs_fields = categoricals or {}
1373
- self._var_fields = var_index
1374
- self._columns_field = obs_columns
1375
- if isinstance(experiment_uri, Artifact):
1376
- self._dataset = experiment_uri.path
1377
- self._artifact = experiment_uri
1605
+ sources: dict[str, dict[str, Record]] | None = None,
1606
+ *,
1607
+ sample_metadata_key: str | None = "sample",
1608
+ ) -> None:
1609
+ super().__init__(
1610
+ dataset=sdata,
1611
+ categoricals={},
1612
+ sources=sources,
1613
+ organism=organism,
1614
+ )
1615
+ if isinstance(sdata, Artifact):
1616
+ self._sdata = sdata.load()
1378
1617
  else:
1379
- self._dataset = UPath(experiment_uri)
1380
- self._artifact = None
1381
- self._organism = organism
1382
- self._sources = sources or {}
1383
- self._exclude = exclude or {}
1384
-
1385
- self._is_validated: bool | None = False
1386
- self._non_validated_values: dict[str, list] | None = None
1387
- self._validated_values: dict[str, list] = {}
1388
- # filled by _check_save_keys
1389
- self._n_obs: int | None = None
1390
- self._valid_obs_keys: list[str] | None = None
1391
- self._obs_pa_schema: pa.lib.Schema | None = (
1392
- None # this is needed to create the obs feature set
1618
+ self._sdata = self._dataset
1619
+ self._sample_metadata_key = sample_metadata_key
1620
+ self._write_path = None
1621
+ self._var_fields = var_index
1622
+ self._verify_accessor_exists(self._var_fields.keys())
1623
+ self._categoricals = categoricals
1624
+ self._table_keys = set(self._var_fields.keys()) | set(
1625
+ self._categoricals.keys() - {self._sample_metadata_key}
1393
1626
  )
1394
- self._valid_var_keys: list[str] | None = None
1395
- self._var_fields_flat: dict[str, FieldAttr] | None = None
1396
- self._check_save_keys()
1397
-
1398
- # check that the provided keys in var_index and categoricals are available in the store
1399
- # and save features
1400
- def _check_save_keys(self):
1401
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1402
-
1403
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1404
- experiment_obs = experiment.obs
1405
- self._n_obs = len(experiment_obs)
1406
- self._obs_pa_schema = experiment_obs.schema
1407
- valid_obs_keys = [
1408
- k for k in self._obs_pa_schema.names if k != "soma_joinid"
1409
- ]
1410
- self._valid_obs_keys = valid_obs_keys
1411
-
1412
- valid_var_keys = []
1413
- ms_list = []
1414
- for ms in experiment.ms.keys():
1415
- ms_list.append(ms)
1416
- var_ms = experiment.ms[ms].var
1417
- valid_var_keys += [
1418
- f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1419
- ]
1420
- self._valid_var_keys = valid_var_keys
1627
+ self._verbosity = verbosity
1628
+ self._sample_df_curator = None
1629
+ if self._sample_metadata_key is not None:
1630
+ self._sample_metadata = self._sdata.get_attrs(
1631
+ key=self._sample_metadata_key, return_as="df", flatten=True
1632
+ )
1633
+ self._is_validated = False
1421
1634
 
1422
- # check validity of keys in categoricals
1635
+ # Check validity of keys in categoricals
1423
1636
  nonval_keys = []
1424
- for obs_key in self._obs_fields.keys():
1425
- if obs_key not in valid_obs_keys:
1426
- nonval_keys.append(obs_key)
1637
+ for accessor, accessor_categoricals in self._categoricals.items():
1638
+ if (
1639
+ accessor == self._sample_metadata_key
1640
+ and self._sample_metadata is not None
1641
+ ):
1642
+ for key in accessor_categoricals.keys():
1643
+ if key not in self._sample_metadata.columns:
1644
+ nonval_keys.append(key)
1645
+ else:
1646
+ for key in accessor_categoricals.keys():
1647
+ if key not in self._sdata[accessor].obs.columns:
1648
+ nonval_keys.append(key)
1649
+
1427
1650
  _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1428
1651
 
1429
- # check validity of keys in var_index
1430
- self._var_fields_flat = {}
1652
+ # check validity of keys in sources
1431
1653
  nonval_keys = []
1432
- for ms_key in self._var_fields.keys():
1433
- var_key, var_field = self._var_fields[ms_key]
1434
- var_key_flat = f"{ms_key}__{var_key}"
1435
- if var_key_flat not in valid_var_keys:
1436
- nonval_keys.append(f"({ms_key}, {var_key})")
1654
+ for accessor, accessor_sources in self._sources.items():
1655
+ if (
1656
+ accessor == self._sample_metadata_key
1657
+ and self._sample_metadata is not None
1658
+ ):
1659
+ columns = self._sample_metadata.columns
1660
+ elif accessor != self._sample_metadata_key:
1661
+ columns = self._sdata[accessor].obs.columns
1437
1662
  else:
1438
- self._var_fields_flat[var_key_flat] = var_field
1439
- _maybe_curation_keys_not_present(nonval_keys, "var_index")
1440
-
1441
- # check validity of keys in sources and exclude
1442
- valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1443
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1444
- nonval_keys = []
1445
- for arg_key in dct.keys():
1446
- if arg_key not in valid_arg_keys:
1447
- nonval_keys.append(arg_key)
1448
- _maybe_curation_keys_not_present(nonval_keys, name)
1663
+ continue
1664
+ for key in accessor_sources:
1665
+ if key not in columns:
1666
+ nonval_keys.append(key)
1667
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1449
1668
 
1450
- # register obs columns' names
1451
- register_columns = list(self._obs_fields.keys())
1452
- organism = check_registry_organism(
1453
- self._columns_field.field.model, self._organism
1454
- ).get("organism")
1455
- update_registry(
1456
- values=register_columns,
1457
- field=self._columns_field,
1458
- key="columns",
1459
- validated_only=False,
1460
- organism=organism,
1461
- source=self._sources.get("columns"),
1462
- exclude=self._exclude.get("columns"),
1463
- )
1464
- additional_columns = [k for k in valid_obs_keys if k not in register_columns]
1465
- # no need to register with validated_only=True if columns are features
1669
+ # Set up sample level metadata and table Curator objects
1466
1670
  if (
1467
- len(additional_columns) > 0
1468
- and self._columns_field.field.model is not Feature
1671
+ self._sample_metadata_key is not None
1672
+ and self._sample_metadata_key in self._categoricals
1469
1673
  ):
1470
- update_registry(
1471
- values=additional_columns,
1472
- field=self._columns_field,
1473
- key="columns",
1474
- validated_only=True,
1674
+ self._sample_df_curator = DataFrameCatManager(
1675
+ df=self._sample_metadata,
1676
+ columns=Feature.name,
1677
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1678
+ verbosity=verbosity,
1679
+ sources=self._sources.get(self._sample_metadata_key),
1680
+ organism=organism,
1681
+ )
1682
+ self._table_adata_curators = {
1683
+ table: AnnDataCatManager(
1684
+ data=self._sdata[table],
1685
+ var_index=var_index.get(table),
1686
+ categoricals=self._categoricals.get(table),
1687
+ verbosity=verbosity,
1688
+ sources=self._sources.get(table),
1475
1689
  organism=organism,
1476
- source=self._sources.get("columns"),
1477
- exclude=self._exclude.get("columns"),
1478
1690
  )
1691
+ for table in self._table_keys
1692
+ }
1479
1693
 
1480
- def validate(self):
1481
- """Validate categories."""
1482
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1694
+ self._non_validated = None
1483
1695
 
1484
- validated = True
1485
- self._non_validated_values = {}
1486
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1487
- for ms, (key, field) in self._var_fields.items():
1488
- var_ms = experiment.ms[ms].var
1489
- var_ms_key = f"{ms}__{key}"
1490
- # it was already validated and cached
1491
- if var_ms_key in self._validated_values:
1492
- continue
1493
- var_ms_values = (
1494
- var_ms.read(column_names=[key]).concat()[key].to_pylist()
1495
- )
1496
- organism = check_registry_organism(
1497
- field.field.model, self._organism
1498
- ).get("organism")
1499
- update_registry(
1500
- values=var_ms_values,
1501
- field=field,
1502
- key=var_ms_key,
1503
- validated_only=True,
1504
- organism=organism,
1505
- source=self._sources.get(var_ms_key),
1506
- exclude=self._exclude.get(var_ms_key),
1507
- )
1508
- _, non_val = validate_categories(
1509
- values=var_ms_values,
1510
- field=field,
1511
- key=var_ms_key,
1512
- organism=organism,
1513
- source=self._sources.get(var_ms_key),
1514
- exclude=self._exclude.get(var_ms_key),
1515
- )
1516
- if len(non_val) > 0:
1517
- validated = False
1518
- self._non_validated_values[var_ms_key] = non_val
1519
- else:
1520
- self._validated_values[var_ms_key] = var_ms_values
1696
+ @property
1697
+ def var_index(self) -> FieldAttr:
1698
+ """Return the registry fields to validate variables indices against."""
1699
+ return self._var_fields
1521
1700
 
1522
- obs = experiment.obs
1523
- for key, field in self._obs_fields.items():
1524
- # already validated and cached
1525
- if key in self._validated_values:
1526
- continue
1527
- values = pa.compute.unique(
1528
- obs.read(column_names=[key]).concat()[key]
1529
- ).to_pylist()
1530
- organism = check_registry_organism(
1531
- field.field.model, self._organism
1532
- ).get("organism")
1533
- update_registry(
1534
- values=values,
1535
- field=field,
1536
- key=key,
1537
- validated_only=True,
1538
- organism=organism,
1539
- source=self._sources.get(key),
1540
- exclude=self._exclude.get(key),
1541
- )
1542
- _, non_val = validate_categories(
1543
- values=values,
1544
- field=field,
1545
- key=key,
1546
- organism=organism,
1547
- source=self._sources.get(key),
1548
- exclude=self._exclude.get(key),
1549
- )
1550
- if len(non_val) > 0:
1551
- validated = False
1552
- self._non_validated_values[key] = non_val
1553
- else:
1554
- self._validated_values[key] = values
1555
- self._is_validated = validated
1556
- return self._is_validated
1701
+ @property
1702
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1703
+ """Return the categorical keys and fields to validate against."""
1704
+ return self._categoricals
1557
1705
 
1558
- def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1559
- assert self._non_validated_values is not None # noqa: S101
1706
+ @property
1707
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1708
+ """Return the non-validated features and labels."""
1709
+ if self._non_validated is None:
1710
+ raise ValidationError("Please run validate() first!")
1711
+ return self._non_validated
1560
1712
 
1561
- if key in self._valid_obs_keys:
1562
- field = self._obs_fields[key]
1563
- elif key in self._valid_var_keys:
1564
- ms = key.partition("__")[0]
1565
- field = self._var_fields[ms][1]
1566
- else:
1567
- raise KeyError(f"key {key} is invalid!")
1568
- values = self._non_validated_values.get(key, [])
1569
- return values, field
1713
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1714
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1715
+ for acc in accessors:
1716
+ is_present = False
1717
+ try:
1718
+ self._sdata.get_attrs(key=acc)
1719
+ is_present = True
1720
+ except KeyError:
1721
+ if acc in self._sdata.tables.keys():
1722
+ is_present = True
1723
+ if not is_present:
1724
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1570
1725
 
1571
- def add_new_from(self, key: str, **kwargs) -> None:
1572
- """Add validated & new categories.
1726
+ def lookup(self, public: bool = False) -> CatLookup:
1727
+ """Look up categories.
1573
1728
 
1574
1729
  Args:
1575
- key: The key referencing the slot in the `tiledbsoma` store.
1576
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1577
- or a column name in `.obs`.
1730
+ public: Whether the lookup is performed on the public reference.
1578
1731
  """
1579
- if self._non_validated_values is None:
1580
- raise ValidationError("Run .validate() first.")
1581
- if key == "all":
1582
- keys = list(self._non_validated_values.keys())
1583
- else:
1584
- avail_keys = list(
1585
- chain(self._non_validated_values.keys(), self._validated_values.keys())
1586
- )
1587
- if key not in avail_keys:
1588
- raise KeyError(
1589
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1590
- )
1591
- keys = [key]
1592
- for k in keys:
1593
- values, field = self._non_validated_values_field(k)
1594
- if len(values) == 0:
1595
- continue
1596
- organism = check_registry_organism(field.field.model, self._organism).get(
1597
- "organism"
1732
+ cat_values_dict = list(self.categoricals.values())[0]
1733
+ return CatLookup(
1734
+ categoricals=cat_values_dict,
1735
+ slots={"accessors": cat_values_dict.keys()},
1736
+ public=public,
1737
+ organism=self._organism,
1738
+ sources=self._sources,
1739
+ )
1740
+
1741
+ def _update_registry_all(self) -> None:
1742
+ """Saves labels of all features for sample and table metadata."""
1743
+ if self._sample_df_curator is not None:
1744
+ self._sample_df_curator._update_registry_all(
1745
+ validated_only=True,
1598
1746
  )
1599
- update_registry(
1600
- values=values,
1601
- field=field,
1602
- key=k,
1603
- validated_only=False,
1604
- organism=organism,
1605
- source=self._sources.get(k),
1606
- exclude=self._exclude.get(k),
1607
- **kwargs,
1747
+ for _, adata_curator in self._table_adata_curators.items():
1748
+ adata_curator._obs_df_curator._update_registry_all(
1749
+ validated_only=True,
1608
1750
  )
1609
- # update non-validated values list but keep the key there
1610
- # it will be removed by .validate()
1611
- if k in self._non_validated_values:
1612
- self._non_validated_values[k] = []
1613
1751
 
1614
- @property
1615
- def non_validated(self) -> dict[str, list]:
1616
- """Return the non-validated features and labels."""
1617
- non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
1618
- return non_val
1752
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1753
+ """Save new values from ``.var.index`` of table.
1619
1754
 
1620
- @property
1621
- def var_index(self) -> dict[str, FieldAttr]:
1622
- """Return the registry fields with flattened keys to validate variables indices against."""
1623
- return self._var_fields_flat
1755
+ Args:
1756
+ table: The table key.
1757
+ organism: The organism name.
1758
+ **kwargs: Additional keyword arguments to pass to create new records.
1759
+ """
1760
+ if self._non_validated is None:
1761
+ raise ValidationError("Run .validate() first.")
1762
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1763
+ if table in self.non_validated.keys():
1764
+ if "var_index" in self._non_validated[table]:
1765
+ self._non_validated[table].pop("var_index")
1624
1766
 
1625
- @property
1626
- def categoricals(self) -> dict[str, FieldAttr]:
1627
- """Return the obs fields to validate against."""
1628
- return self._obs_fields
1767
+ if len(self.non_validated[table].values()) == 0:
1768
+ self.non_validated.pop(table)
1629
1769
 
1630
- def lookup(self, public: bool = False) -> CurateLookup:
1631
- """Lookup categories.
1770
+ def add_new_from(
1771
+ self,
1772
+ key: str,
1773
+ accessor: str | None = None,
1774
+ **kwargs,
1775
+ ) -> None:
1776
+ """Save new values of categorical from sample level metadata or table.
1632
1777
 
1633
1778
  Args:
1634
- public: If "public", the lookup is performed on the public reference.
1779
+ key: The key referencing the slot in the DataFrame.
1780
+ accessor: The accessor key such as 'sample' or 'table x'.
1781
+ organism: The organism name.
1782
+ **kwargs: Additional keyword arguments to pass to create new records.
1635
1783
  """
1636
- return CurateLookup(
1637
- categoricals=self._obs_fields,
1638
- slots={"columns": self._columns_field, **self._var_fields_flat},
1639
- public=public,
1640
- )
1784
+ if self._non_validated is None:
1785
+ raise ValidationError("Run .validate() first.")
1641
1786
 
1642
- def standardize(self, key: str):
1643
- """Replace synonyms with standardized values.
1787
+ if len(kwargs) > 0 and key == "all":
1788
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
1789
+
1790
+ if accessor not in self.categoricals:
1791
+ raise ValueError(
1792
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
1793
+ )
1794
+
1795
+ if accessor in self._table_adata_curators:
1796
+ adata_curator = self._table_adata_curators[accessor]
1797
+ adata_curator.add_new_from(key=key, **kwargs)
1798
+ if accessor == self._sample_metadata_key:
1799
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
1800
+
1801
+ if accessor in self.non_validated.keys():
1802
+ if len(self.non_validated[accessor].values()) == 0:
1803
+ self.non_validated.pop(accessor)
1804
+
1805
+ def standardize(self, key: str, accessor: str | None = None) -> None:
1806
+ """Replace synonyms with canonical values.
1644
1807
 
1645
1808
  Modifies the dataset inplace.
1646
1809
 
1647
1810
  Args:
1648
- key: The key referencing the slot in the `tiledbsoma` store.
1649
- It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1650
- or a column name in `.obs`.
1811
+ key: The key referencing the slot in the table or sample metadata.
1812
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
1651
1813
  """
1652
1814
  if len(self.non_validated) == 0:
1653
1815
  logger.warning("values are already standardized")
1654
1816
  return
1655
- avail_keys = list(self._non_validated_values.keys())
1656
- if key == "all":
1657
- keys = avail_keys
1817
+ if self._artifact is not None:
1818
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1819
+
1820
+ if accessor == self._sample_metadata_key:
1821
+ if key not in self._sample_metadata.columns:
1822
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1658
1823
  else:
1659
- if key not in avail_keys:
1660
- raise KeyError(
1661
- f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1662
- )
1663
- keys = [key]
1824
+ if (
1825
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
1826
+ ) or (
1827
+ key != "var_index"
1828
+ and key not in self._sdata.tables[accessor].obs.columns
1829
+ ):
1830
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
1664
1831
 
1665
- for k in keys:
1666
- values, field = self._non_validated_values_field(k)
1667
- if len(values) == 0:
1668
- continue
1669
- if k in self._valid_var_keys:
1670
- ms, _, slot_key = k.partition("__")
1671
- slot = lambda experiment: experiment.ms[ms].var # noqa: B023
1672
- else:
1673
- slot = lambda experiment: experiment.obs
1674
- slot_key = k
1675
- # errors if public ontology and the model has no organism
1676
- # has to be fixed in bionty
1677
- organism = check_registry_organism(field.field.model, self._organism).get(
1678
- "organism"
1679
- )
1680
- syn_mapper = standardize_categories(
1681
- values=values,
1682
- field=field,
1683
- source=self._sources.get(k),
1684
- organism=organism,
1685
- )
1686
- if (n_syn_mapper := len(syn_mapper)) == 0:
1687
- continue
1832
+ if accessor in self._table_adata_curators.keys():
1833
+ adata_curator = self._table_adata_curators[accessor]
1834
+ adata_curator.standardize(key)
1835
+ if accessor == self._sample_metadata_key:
1836
+ self._sample_df_curator.standardize(key)
1688
1837
 
1689
- from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1838
+ if len(self.non_validated[accessor].values()) == 0:
1839
+ self.non_validated.pop(accessor)
1690
1840
 
1691
- with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1692
- value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1693
- table = slot(experiment).read(value_filter=value_filter).concat()
1841
+ def validate(self) -> bool:
1842
+ """Validate variables and categorical observations.
1694
1843
 
1695
- if len(table) == 0:
1696
- continue
1844
+ This method also registers the validated records in the current instance:
1845
+ - from public sources
1697
1846
 
1698
- df = table.to_pandas()
1699
- # map values
1700
- df[slot_key] = df[slot_key].map(
1701
- lambda val: syn_mapper.get(val, val) # noqa
1702
- )
1703
- # write the mapped values
1704
- with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1705
- slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1706
- # update non_validated dict
1707
- non_val_k = [
1708
- nv for nv in self._non_validated_values[k] if nv not in syn_mapper
1709
- ]
1710
- self._non_validated_values[k] = non_val_k
1847
+ Args:
1848
+ organism: The organism name.
1711
1849
 
1712
- syn_mapper_print = _format_values(
1713
- [f'"{m_k}" "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
1714
- )
1715
- s = "s" if n_syn_mapper > 1 else ""
1716
- logger.success(
1717
- f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
1718
- )
1850
+ Returns:
1851
+ Whether the SpatialData object is validated.
1852
+ """
1853
+ # add all validated records to the current instance
1854
+ self._update_registry_all()
1855
+
1856
+ self._non_validated = {} # type: ignore
1857
+
1858
+ sample_validated = True
1859
+ if self._sample_df_curator:
1860
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
1861
+ sample_validated &= self._sample_df_curator.validate()
1862
+ if len(self._sample_df_curator.non_validated) > 0:
1863
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
1864
+ logger.print("")
1865
+
1866
+ mods_validated = True
1867
+ for table, adata_curator in self._table_adata_curators.items():
1868
+ logger.info(f"validating categoricals of table '{table}' ...")
1869
+ mods_validated &= adata_curator.validate()
1870
+ if len(adata_curator.non_validated) > 0:
1871
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
1872
+ logger.print("")
1873
+
1874
+ self._is_validated = sample_validated & mods_validated
1875
+ return self._is_validated
1719
1876
 
1720
1877
  def save_artifact(
1721
1878
  self,
@@ -1725,424 +1882,373 @@ class TiledbsomaCatManager(CatManager):
1725
1882
  revises: Artifact | None = None,
1726
1883
  run: Run | None = None,
1727
1884
  ) -> Artifact:
1728
- """Save the validated `tiledbsoma` store and metadata.
1885
+ """Save the validated SpatialData store and metadata.
1729
1886
 
1730
1887
  Args:
1731
- description: A description of the ``tiledbsoma`` store.
1888
+ description: A description of the dataset.
1732
1889
  key: A path-like key to reference artifact in default storage,
1733
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1890
+ e.g., `"myartifact.zarr"`. Artifacts with the same key form a version family.
1734
1891
  revises: Previous version of the artifact. Triggers a revision.
1735
1892
  run: The run that creates the artifact.
1736
1893
 
1737
1894
  Returns:
1738
1895
  A saved artifact record.
1739
1896
  """
1740
- from lamindb.models.artifact import add_labels
1741
-
1742
1897
  if not self._is_validated:
1743
1898
  self.validate()
1744
1899
  if not self._is_validated:
1745
1900
  raise ValidationError("Dataset does not validate. Please curate.")
1746
1901
 
1747
- if self._artifact is None:
1748
- artifact = Artifact(
1749
- self._dataset,
1750
- description=description,
1751
- key=key,
1752
- revises=revises,
1753
- run=run,
1754
- )
1755
- artifact.n_observations = self._n_obs
1756
- artifact.otype = "tiledbsoma"
1757
- artifact.save()
1758
- else:
1759
- artifact = self._artifact
1760
-
1761
- feature_sets = {}
1762
- if len(self._obs_fields) > 0:
1763
- organism = check_registry_organism(
1764
- self._columns_field.field.model, self._organism
1765
- ).get("organism")
1766
- empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
1767
- mock_df = pa.Table.from_pydict(
1768
- empty_dict, schema=self._obs_pa_schema
1769
- ).to_pandas()
1770
- # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1771
- feature_sets["obs"] = Schema.from_df(
1772
- df=mock_df,
1773
- field=self._columns_field,
1774
- mute=True,
1775
- organism=organism,
1776
- )
1777
- for ms in self._var_fields:
1778
- var_key, var_field = self._var_fields[ms]
1779
- organism = check_registry_organism(
1780
- var_field.field.model, self._organism
1781
- ).get("organism")
1782
- feature_sets[f"{ms}__var"] = Schema.from_values(
1783
- values=self._validated_values[f"{ms}__{var_key}"],
1784
- field=var_field,
1785
- organism=organism,
1786
- raise_validation_error=False,
1787
- )
1788
- artifact._staged_feature_sets = feature_sets
1789
-
1790
- feature_ref_is_name = _ref_is_name(self._columns_field)
1791
- features = Feature.lookup().dict()
1792
- for key, field in self._obs_fields.items():
1793
- feature = features.get(key)
1794
- registry = field.field.model
1795
- organism = check_registry_organism(field.field.model, self._organism).get(
1796
- "organism"
1797
- )
1798
- labels = registry.from_values(
1799
- values=self._validated_values[key], field=field, organism=organism
1800
- )
1801
- if len(labels) == 0:
1802
- continue
1803
- if hasattr(registry, "_name_field"):
1804
- label_ref_is_name = field.field.name == registry._name_field
1805
- add_labels(
1806
- artifact,
1807
- records=labels,
1808
- feature=feature,
1809
- feature_ref_is_name=feature_ref_is_name,
1810
- label_ref_is_name=label_ref_is_name,
1811
- from_curator=True,
1812
- )
1813
-
1814
- return artifact.save()
1815
-
1816
-
1817
- class SpatialDataCatManager(CatManager):
1818
- """Curation flow for a ``Spatialdata`` object.
1819
-
1820
- See also :class:`~lamindb.Curator`.
1821
-
1822
- Note that if genes or other measurements are removed from the SpatialData object,
1823
- the object should be recreated.
1824
-
1825
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1902
+ return save_artifact(
1903
+ self._sdata,
1904
+ description=description,
1905
+ fields=self.categoricals,
1906
+ index_field=self.var_index,
1907
+ key=key,
1908
+ artifact=self._artifact,
1909
+ revises=revises,
1910
+ run=run,
1911
+ schema=None,
1912
+ organism=self._organism,
1913
+ sample_metadata_key=self._sample_metadata_key,
1914
+ )
1826
1915
 
1827
- Args:
1828
- sdata: The SpatialData object to curate.
1829
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1830
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1831
1916
 
1832
- organism: The organism name.
1833
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1834
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1835
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1836
- using the exclude parameter ensures they are not validated.
1837
- verbosity: The verbosity level of the logger.
1838
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1839
-
1840
- Examples:
1841
- >>> import bionty as bt
1842
- >>> curator = SpatialDataCatManager(
1843
- ... sdata,
1844
- ... var_index={
1845
- ... "table_1": bt.Gene.ensembl_gene_id,
1846
- ... },
1847
- ... categoricals={
1848
- ... "table1":
1849
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1850
- ... "sample":
1851
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1852
- ... },
1853
- ... organism="human",
1854
- ... )
1855
- """
1917
+ class TiledbsomaCatManager(CatManager):
1918
+ """Categorical manager for `tiledbsoma.Experiment`."""
1856
1919
 
1857
1920
  def __init__(
1858
1921
  self,
1859
- sdata: Any,
1860
- var_index: dict[str, FieldAttr],
1861
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1862
- verbosity: str = "hint",
1922
+ experiment_uri: UPathStr | Artifact,
1923
+ var_index: dict[str, tuple[str, FieldAttr]],
1924
+ categoricals: dict[str, FieldAttr] | None = None,
1925
+ obs_columns: FieldAttr = Feature.name,
1863
1926
  organism: str | None = None,
1864
- sources: dict[str, dict[str, Record]] | None = None,
1865
- exclude: dict[str, dict] | None = None,
1866
- *,
1867
- sample_metadata_key: str | None = "sample",
1868
- ) -> None:
1869
- super().__init__(
1870
- dataset=sdata,
1871
- categoricals={},
1872
- sources=sources,
1873
- organism=organism,
1874
- exclude=exclude,
1875
- )
1876
- if isinstance(sdata, Artifact):
1877
- # TODO: load() doesn't yet work
1878
- self._sdata = sdata.load()
1879
- else:
1880
- self._sdata = self._dataset
1881
- self._sample_metadata_key = sample_metadata_key
1882
- self._write_path = None
1927
+ sources: dict[str, Record] | None = None,
1928
+ ):
1929
+ self._obs_fields = categoricals or {}
1883
1930
  self._var_fields = var_index
1884
- self._verify_accessor_exists(self._var_fields.keys())
1885
- self._categoricals = categoricals
1886
- self._table_keys = set(self._var_fields.keys()) | set(
1887
- self._categoricals.keys() - {self._sample_metadata_key}
1888
- )
1889
- self._verbosity = verbosity
1890
- self._sample_df_curator = None
1891
- if self._sample_metadata_key is not None:
1892
- self._sample_metadata = self._sdata.get_attrs(
1893
- key=self._sample_metadata_key, return_as="df", flatten=True
1894
- )
1895
- self._is_validated = False
1896
-
1897
- # Check validity of keys in categoricals
1898
- nonval_keys = []
1899
- for accessor, accessor_categoricals in self._categoricals.items():
1900
- if (
1901
- accessor == self._sample_metadata_key
1902
- and self._sample_metadata is not None
1903
- ):
1904
- for key in accessor_categoricals.keys():
1905
- if key not in self._sample_metadata.columns:
1906
- nonval_keys.append(key)
1907
- else:
1908
- for key in accessor_categoricals.keys():
1909
- if key not in self._sdata[accessor].obs.columns:
1910
- nonval_keys.append(key)
1911
-
1912
- _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1913
-
1914
- # check validity of keys in sources and exclude
1915
- for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1916
- nonval_keys = []
1917
- for accessor, accessor_sources in dct.items():
1918
- if (
1919
- accessor == self._sample_metadata_key
1920
- and self._sample_metadata is not None
1921
- ):
1922
- columns = self._sample_metadata.columns
1923
- elif accessor != self._sample_metadata_key:
1924
- columns = self._sdata[accessor].obs.columns
1925
- else:
1926
- continue
1927
- for key in accessor_sources:
1928
- if key not in columns:
1929
- nonval_keys.append(key)
1930
- _maybe_curation_keys_not_present(nonval_keys, name)
1931
+ self._columns_field = obs_columns
1932
+ if isinstance(experiment_uri, Artifact):
1933
+ self._dataset = experiment_uri.path
1934
+ self._artifact = experiment_uri
1935
+ else:
1936
+ self._dataset = UPath(experiment_uri)
1937
+ self._artifact = None
1938
+ self._organism = organism
1939
+ self._sources = sources or {}
1931
1940
 
1932
- # Set up sample level metadata and table Curator objects
1933
- if (
1934
- self._sample_metadata_key is not None
1935
- and self._sample_metadata_key in self._categoricals
1936
- ):
1937
- self._sample_df_curator = DataFrameCatManager(
1938
- df=self._sample_metadata,
1939
- columns=Feature.name,
1940
- categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1941
- verbosity=verbosity,
1942
- sources=self._sources.get(self._sample_metadata_key),
1943
- exclude=self._exclude.get(self._sample_metadata_key),
1944
- organism=organism,
1945
- )
1946
- self._table_adata_curators = {
1947
- table: AnnDataCatManager(
1948
- data=self._sdata[table],
1949
- var_index=var_index.get(table),
1950
- categoricals=self._categoricals.get(table),
1951
- verbosity=verbosity,
1952
- sources=self._sources.get(table),
1953
- exclude=self._exclude.get(table),
1954
- organism=organism,
1955
- )
1956
- for table in self._table_keys
1957
- }
1941
+ self._is_validated: bool | None = False
1942
+ self._non_validated_values: dict[str, list] | None = None
1943
+ self._validated_values: dict[str, list] = {}
1944
+ # filled by _check_save_keys
1945
+ self._n_obs: int | None = None
1946
+ self._valid_obs_keys: list[str] | None = None
1947
+ self._obs_pa_schema: pa.lib.Schema | None = (
1948
+ None # this is needed to create the obs feature set
1949
+ )
1950
+ self._valid_var_keys: list[str] | None = None
1951
+ self._var_fields_flat: dict[str, FieldAttr] | None = None
1952
+ self._check_save_keys()
1958
1953
 
1959
- self._non_validated = None
1954
+ # check that the provided keys in var_index and categoricals are available in the store
1955
+ # and save features
1956
+ def _check_save_keys(self):
1957
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1960
1958
 
1961
- @property
1962
- def var_index(self) -> FieldAttr:
1963
- """Return the registry fields to validate variables indices against."""
1964
- return self._var_fields
1959
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1960
+ experiment_obs = experiment.obs
1961
+ self._n_obs = len(experiment_obs)
1962
+ self._obs_pa_schema = experiment_obs.schema
1963
+ valid_obs_keys = [
1964
+ k for k in self._obs_pa_schema.names if k != "soma_joinid"
1965
+ ]
1966
+ self._valid_obs_keys = valid_obs_keys
1965
1967
 
1966
- @property
1967
- def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1968
- """Return the categorical keys and fields to validate against."""
1969
- return self._categoricals
1968
+ valid_var_keys = []
1969
+ ms_list = []
1970
+ for ms in experiment.ms.keys():
1971
+ ms_list.append(ms)
1972
+ var_ms = experiment.ms[ms].var
1973
+ valid_var_keys += [
1974
+ f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
1975
+ ]
1976
+ self._valid_var_keys = valid_var_keys
1970
1977
 
1971
- @property
1972
- def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1973
- """Return the non-validated features and labels."""
1974
- if self._non_validated is None:
1975
- raise ValidationError("Please run validate() first!")
1976
- return self._non_validated
1978
+ # check validity of keys in categoricals
1979
+ nonval_keys = []
1980
+ for obs_key in self._obs_fields.keys():
1981
+ if obs_key not in valid_obs_keys:
1982
+ nonval_keys.append(obs_key)
1983
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1977
1984
 
1978
- def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1979
- """Verify that the accessors exist (either a valid table or in attrs)."""
1980
- for acc in accessors:
1981
- is_present = False
1982
- try:
1983
- self._sdata.get_attrs(key=acc)
1984
- is_present = True
1985
- except KeyError:
1986
- if acc in self._sdata.tables.keys():
1987
- is_present = True
1988
- if not is_present:
1989
- raise ValidationError(f"Accessor '{acc}' does not exist!")
1985
+ # check validity of keys in var_index
1986
+ self._var_fields_flat = {}
1987
+ nonval_keys = []
1988
+ for ms_key in self._var_fields.keys():
1989
+ var_key, var_field = self._var_fields[ms_key]
1990
+ var_key_flat = f"{ms_key}__{var_key}"
1991
+ if var_key_flat not in valid_var_keys:
1992
+ nonval_keys.append(f"({ms_key}, {var_key})")
1993
+ else:
1994
+ self._var_fields_flat[var_key_flat] = var_field
1995
+ _maybe_curation_keys_not_present(nonval_keys, "var_index")
1990
1996
 
1991
- def lookup(self, public: bool = False) -> CurateLookup:
1992
- """Look up categories.
1997
+ # check validity of keys in sources
1998
+ valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
1999
+ nonval_keys = []
2000
+ for arg_key in self._sources.keys():
2001
+ if arg_key not in valid_arg_keys:
2002
+ nonval_keys.append(arg_key)
2003
+ _maybe_curation_keys_not_present(nonval_keys, "sources")
1993
2004
 
1994
- Args:
1995
- public: Whether the lookup is performed on the public reference.
1996
- """
1997
- cat_values_dict = list(self.categoricals.values())[0]
1998
- return CurateLookup(
1999
- categoricals=cat_values_dict,
2000
- slots={"accessors": cat_values_dict.keys()},
2001
- public=public,
2005
+ # register obs columns' names
2006
+ register_columns = list(self._obs_fields.keys())
2007
+ update_registry(
2008
+ values=register_columns,
2009
+ field=self._columns_field,
2010
+ key="columns",
2011
+ validated_only=False,
2012
+ organism=self._organism,
2013
+ source=self._sources.get("columns"),
2002
2014
  )
2003
-
2004
- def _update_registry_all(self) -> None:
2005
- """Saves labels of all features for sample and table metadata."""
2006
- if self._sample_df_curator is not None:
2007
- self._sample_df_curator._update_registry_all(
2008
- validated_only=True,
2009
- )
2010
- for _, adata_curator in self._table_adata_curators.items():
2011
- adata_curator._obs_df_curator._update_registry_all(
2015
+ additional_columns = [k for k in valid_obs_keys if k not in register_columns]
2016
+ # no need to register with validated_only=True if columns are features
2017
+ if (
2018
+ len(additional_columns) > 0
2019
+ and self._columns_field.field.model is not Feature
2020
+ ):
2021
+ update_registry(
2022
+ values=additional_columns,
2023
+ field=self._columns_field,
2024
+ key="columns",
2012
2025
  validated_only=True,
2026
+ organism=self._organism,
2027
+ source=self._sources.get("columns"),
2013
2028
  )
2014
2029
 
2015
- def add_new_from_var_index(self, table: str, **kwargs) -> None:
2016
- """Save new values from ``.var.index`` of table.
2030
+ def validate(self):
2031
+ """Validate categories."""
2032
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
2017
2033
 
2018
- Args:
2019
- table: The table key.
2020
- organism: The organism name.
2021
- **kwargs: Additional keyword arguments to pass to create new records.
2022
- """
2023
- if self._non_validated is None:
2024
- raise ValidationError("Run .validate() first.")
2025
- self._table_adata_curators[table].add_new_from_var_index(**kwargs)
2026
- if table in self.non_validated.keys():
2027
- if "var_index" in self._non_validated[table]:
2028
- self._non_validated[table].pop("var_index")
2034
+ validated = True
2035
+ self._non_validated_values = {}
2036
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
2037
+ for ms, (key, field) in self._var_fields.items():
2038
+ var_ms = experiment.ms[ms].var
2039
+ var_ms_key = f"{ms}__{key}"
2040
+ # it was already validated and cached
2041
+ if var_ms_key in self._validated_values:
2042
+ continue
2043
+ var_ms_values = (
2044
+ var_ms.read(column_names=[key]).concat()[key].to_pylist()
2045
+ )
2046
+ update_registry(
2047
+ values=var_ms_values,
2048
+ field=field,
2049
+ key=var_ms_key,
2050
+ validated_only=True,
2051
+ organism=self._organism,
2052
+ source=self._sources.get(var_ms_key),
2053
+ )
2054
+ _, non_val = validate_categories(
2055
+ values=var_ms_values,
2056
+ field=field,
2057
+ key=var_ms_key,
2058
+ organism=self._organism,
2059
+ source=self._sources.get(var_ms_key),
2060
+ )
2061
+ if len(non_val) > 0:
2062
+ validated = False
2063
+ self._non_validated_values[var_ms_key] = non_val
2064
+ else:
2065
+ self._validated_values[var_ms_key] = var_ms_values
2066
+
2067
+ obs = experiment.obs
2068
+ for key, field in self._obs_fields.items():
2069
+ # already validated and cached
2070
+ if key in self._validated_values:
2071
+ continue
2072
+ values = pa.compute.unique(
2073
+ obs.read(column_names=[key]).concat()[key]
2074
+ ).to_pylist()
2075
+ update_registry(
2076
+ values=values,
2077
+ field=field,
2078
+ key=key,
2079
+ validated_only=True,
2080
+ organism=self._organism,
2081
+ source=self._sources.get(key),
2082
+ )
2083
+ _, non_val = validate_categories(
2084
+ values=values,
2085
+ field=field,
2086
+ key=key,
2087
+ organism=self._organism,
2088
+ source=self._sources.get(key),
2089
+ )
2090
+ if len(non_val) > 0:
2091
+ validated = False
2092
+ self._non_validated_values[key] = non_val
2093
+ else:
2094
+ self._validated_values[key] = values
2095
+ self._is_validated = validated
2096
+ return self._is_validated
2097
+
2098
+ def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
2099
+ assert self._non_validated_values is not None # noqa: S101
2029
2100
 
2030
- if len(self.non_validated[table].values()) == 0:
2031
- self.non_validated.pop(table)
2101
+ if key in self._valid_obs_keys:
2102
+ field = self._obs_fields[key]
2103
+ elif key in self._valid_var_keys:
2104
+ ms = key.partition("__")[0]
2105
+ field = self._var_fields[ms][1]
2106
+ else:
2107
+ raise KeyError(f"key {key} is invalid!")
2108
+ values = self._non_validated_values.get(key, [])
2109
+ return values, field
2032
2110
 
2033
- def add_new_from(
2034
- self,
2035
- key: str,
2036
- accessor: str | None = None,
2037
- **kwargs,
2038
- ) -> None:
2039
- """Save new values of categorical from sample level metadata or table.
2111
+ def add_new_from(self, key: str, **kwargs) -> None:
2112
+ """Add validated & new categories.
2040
2113
 
2041
2114
  Args:
2042
- key: The key referencing the slot in the DataFrame.
2043
- accessor: The accessor key such as 'sample' or 'table x'.
2044
- organism: The organism name.
2045
- **kwargs: Additional keyword arguments to pass to create new records.
2115
+ key: The key referencing the slot in the `tiledbsoma` store.
2116
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2117
+ or a column name in `.obs`.
2046
2118
  """
2047
- if self._non_validated is None:
2119
+ if self._non_validated_values is None:
2048
2120
  raise ValidationError("Run .validate() first.")
2121
+ if key == "all":
2122
+ keys = list(self._non_validated_values.keys())
2123
+ else:
2124
+ avail_keys = list(
2125
+ chain(self._non_validated_values.keys(), self._validated_values.keys())
2126
+ )
2127
+ if key not in avail_keys:
2128
+ raise KeyError(
2129
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2130
+ )
2131
+ keys = [key]
2132
+ for k in keys:
2133
+ values, field = self._non_validated_values_field(k)
2134
+ if len(values) == 0:
2135
+ continue
2136
+ update_registry(
2137
+ values=values,
2138
+ field=field,
2139
+ key=k,
2140
+ validated_only=False,
2141
+ organism=self._organism,
2142
+ source=self._sources.get(k),
2143
+ **kwargs,
2144
+ )
2145
+ # update non-validated values list but keep the key there
2146
+ # it will be removed by .validate()
2147
+ if k in self._non_validated_values:
2148
+ self._non_validated_values[k] = []
2049
2149
 
2050
- if len(kwargs) > 0 and key == "all":
2051
- raise ValueError("Cannot pass additional arguments to 'all' key!")
2150
+ @property
2151
+ def non_validated(self) -> dict[str, list]:
2152
+ """Return the non-validated features and labels."""
2153
+ non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
2154
+ return non_val
2052
2155
 
2053
- if accessor not in self.categoricals:
2054
- raise ValueError(
2055
- f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2056
- )
2156
+ @property
2157
+ def var_index(self) -> dict[str, FieldAttr]:
2158
+ """Return the registry fields with flattened keys to validate variables indices against."""
2159
+ return self._var_fields_flat
2057
2160
 
2058
- if accessor in self._table_adata_curators:
2059
- adata_curator = self._table_adata_curators[accessor]
2060
- adata_curator.add_new_from(key=key, **kwargs)
2061
- if accessor == self._sample_metadata_key:
2062
- self._sample_df_curator.add_new_from(key=key, **kwargs)
2161
+ @property
2162
+ def categoricals(self) -> dict[str, FieldAttr]:
2163
+ """Return the obs fields to validate against."""
2164
+ return self._obs_fields
2063
2165
 
2064
- if accessor in self.non_validated.keys():
2065
- if len(self.non_validated[accessor].values()) == 0:
2066
- self.non_validated.pop(accessor)
2166
+ def lookup(self, public: bool = False) -> CatLookup:
2167
+ """Lookup categories.
2067
2168
 
2068
- def standardize(self, key: str, accessor: str | None = None) -> None:
2069
- """Replace synonyms with canonical values.
2169
+ Args:
2170
+ public: If "public", the lookup is performed on the public reference.
2171
+ """
2172
+ return CatLookup(
2173
+ categoricals=self._obs_fields,
2174
+ slots={"columns": self._columns_field, **self._var_fields_flat},
2175
+ public=public,
2176
+ organism=self._organism,
2177
+ sources=self._sources,
2178
+ )
2179
+
2180
+ def standardize(self, key: str):
2181
+ """Replace synonyms with standardized values.
2070
2182
 
2071
2183
  Modifies the dataset inplace.
2072
2184
 
2073
2185
  Args:
2074
- key: The key referencing the slot in the table or sample metadata.
2075
- accessor: The accessor key such as 'sample_key' or 'table_key'.
2186
+ key: The key referencing the slot in the `tiledbsoma` store.
2187
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
2188
+ or a column name in `.obs`.
2076
2189
  """
2077
2190
  if len(self.non_validated) == 0:
2078
2191
  logger.warning("values are already standardized")
2079
2192
  return
2080
- if self._artifact is not None:
2081
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2082
-
2083
- if accessor == self._sample_metadata_key:
2084
- if key not in self._sample_metadata.columns:
2085
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2193
+ avail_keys = list(self._non_validated_values.keys())
2194
+ if key == "all":
2195
+ keys = avail_keys
2086
2196
  else:
2087
- if (
2088
- key == "var_index" and self._sdata.tables[accessor].var.index is None
2089
- ) or (
2090
- key != "var_index"
2091
- and key not in self._sdata.tables[accessor].obs.columns
2092
- ):
2093
- raise ValueError(f"key '{key}' not present in '{accessor}'!")
2094
-
2095
- if accessor in self._table_adata_curators.keys():
2096
- adata_curator = self._table_adata_curators[accessor]
2097
- adata_curator.standardize(key)
2098
- if accessor == self._sample_metadata_key:
2099
- self._sample_df_curator.standardize(key)
2100
-
2101
- if len(self.non_validated[accessor].values()) == 0:
2102
- self.non_validated.pop(accessor)
2103
-
2104
- def validate(self) -> bool:
2105
- """Validate variables and categorical observations.
2106
-
2107
- This method also registers the validated records in the current instance:
2108
- - from public sources
2109
-
2110
- Args:
2111
- organism: The organism name.
2197
+ if key not in avail_keys:
2198
+ raise KeyError(
2199
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
2200
+ )
2201
+ keys = [key]
2112
2202
 
2113
- Returns:
2114
- Whether the SpatialData object is validated.
2115
- """
2116
- from lamindb.core._settings import settings
2203
+ for k in keys:
2204
+ values, field = self._non_validated_values_field(k)
2205
+ if len(values) == 0:
2206
+ continue
2207
+ if k in self._valid_var_keys:
2208
+ ms, _, slot_key = k.partition("__")
2209
+ slot = lambda experiment: experiment.ms[ms].var # noqa: B023
2210
+ else:
2211
+ slot = lambda experiment: experiment.obs
2212
+ slot_key = k
2213
+ syn_mapper = standardize_categories(
2214
+ values=values,
2215
+ field=field,
2216
+ source=self._sources.get(k),
2217
+ organism=self._organism,
2218
+ )
2219
+ if (n_syn_mapper := len(syn_mapper)) == 0:
2220
+ continue
2117
2221
 
2118
- # add all validated records to the current instance
2119
- verbosity = settings.verbosity
2120
- try:
2121
- settings.verbosity = "error"
2122
- self._update_registry_all()
2123
- finally:
2124
- settings.verbosity = verbosity
2222
+ from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
2125
2223
 
2126
- self._non_validated = {} # type: ignore
2224
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
2225
+ value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
2226
+ table = slot(experiment).read(value_filter=value_filter).concat()
2127
2227
 
2128
- sample_validated = True
2129
- if self._sample_df_curator:
2130
- logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2131
- sample_validated &= self._sample_df_curator.validate()
2132
- if len(self._sample_df_curator.non_validated) > 0:
2133
- self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2134
- logger.print("")
2228
+ if len(table) == 0:
2229
+ continue
2135
2230
 
2136
- mods_validated = True
2137
- for table, adata_curator in self._table_adata_curators.items():
2138
- logger.info(f"validating categoricals of table '{table}' ...")
2139
- mods_validated &= adata_curator.validate()
2140
- if len(adata_curator.non_validated) > 0:
2141
- self._non_validated[table] = adata_curator.non_validated # type: ignore
2142
- logger.print("")
2231
+ df = table.to_pandas()
2232
+ # map values
2233
+ df[slot_key] = df[slot_key].map(
2234
+ lambda val: syn_mapper.get(val, val) # noqa
2235
+ )
2236
+ # write the mapped values
2237
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
2238
+ slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
2239
+ # update non_validated dict
2240
+ non_val_k = [
2241
+ nv for nv in self._non_validated_values[k] if nv not in syn_mapper
2242
+ ]
2243
+ self._non_validated_values[k] = non_val_k
2143
2244
 
2144
- self._is_validated = sample_validated & mods_validated
2145
- return self._is_validated
2245
+ syn_mapper_print = _format_values(
2246
+ [f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
2247
+ )
2248
+ s = "s" if n_syn_mapper > 1 else ""
2249
+ logger.success(
2250
+ f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
2251
+ )
2146
2252
 
2147
2253
  def save_artifact(
2148
2254
  self,
@@ -2152,217 +2258,112 @@ class SpatialDataCatManager(CatManager):
2152
2258
  revises: Artifact | None = None,
2153
2259
  run: Run | None = None,
2154
2260
  ) -> Artifact:
2261
+ """Save the validated `tiledbsoma` store and metadata.
2262
+
2263
+ Args:
2264
+ description: A description of the ``tiledbsoma`` store.
2265
+ key: A path-like key to reference artifact in default storage,
2266
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
2267
+ revises: Previous version of the artifact. Triggers a revision.
2268
+ run: The run that creates the artifact.
2269
+
2270
+ Returns:
2271
+ A saved artifact record.
2272
+ """
2155
2273
  if not self._is_validated:
2156
2274
  self.validate()
2157
2275
  if not self._is_validated:
2158
2276
  raise ValidationError("Dataset does not validate. Please curate.")
2159
2277
 
2160
- verbosity = settings.verbosity
2161
- try:
2162
- settings.verbosity = "warning"
2163
-
2164
- self._artifact = Artifact.from_spatialdata(
2165
- self._sdata,
2166
- key=key,
2278
+ if self._artifact is None:
2279
+ artifact = Artifact(
2280
+ self._dataset,
2167
2281
  description=description,
2282
+ key=key,
2168
2283
  revises=revises,
2169
2284
  run=run,
2170
2285
  )
2171
- self._artifact.save()
2286
+ artifact.n_observations = self._n_obs
2287
+ artifact.otype = "tiledbsoma"
2288
+ artifact.save()
2289
+ else:
2290
+ artifact = self._artifact
2291
+
2292
+ feature_sets = {}
2293
+ if len(self._obs_fields) > 0:
2294
+ empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
2295
+ mock_df = pa.Table.from_pydict(
2296
+ empty_dict, schema=self._obs_pa_schema
2297
+ ).to_pandas()
2298
+ # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
2299
+ feature_sets["obs"] = Schema.from_df(
2300
+ df=mock_df,
2301
+ field=self._columns_field,
2302
+ mute=True,
2303
+ organism=self._organism,
2304
+ )
2305
+ for ms in self._var_fields:
2306
+ var_key, var_field = self._var_fields[ms]
2307
+ feature_sets[f"{ms}__var"] = Schema.from_values(
2308
+ values=self._validated_values[f"{ms}__{var_key}"],
2309
+ field=var_field,
2310
+ organism=self._organism,
2311
+ raise_validation_error=False,
2312
+ )
2313
+ artifact._staged_feature_sets = feature_sets
2172
2314
 
2173
- # Link schemas
2174
- feature_kwargs = check_registry_organism(
2175
- (list(self._var_fields.values())[0].field.model),
2176
- self._organism,
2315
+ feature_ref_is_name = _ref_is_name(self._columns_field)
2316
+ features = Feature.lookup().dict()
2317
+ for key, field in self._obs_fields.items():
2318
+ feature = features.get(key)
2319
+ registry = field.field.model
2320
+ labels = registry.from_values(
2321
+ values=self._validated_values[key],
2322
+ field=field,
2323
+ organism=self._organism,
2177
2324
  )
2178
-
2179
- def _add_set_from_spatialdata(
2180
- host: Artifact | Collection | Run,
2181
- var_fields: dict[str, FieldAttr],
2182
- obs_fields: dict[str, FieldAttr] = None,
2183
- mute: bool = False,
2184
- organism: str | Record | None = None,
2185
- ):
2186
- """Add Schemas from SpatialData."""
2187
- if obs_fields is None:
2188
- obs_fields = {}
2189
- assert host.otype == "SpatialData" # noqa: S101
2190
-
2191
- feature_sets = {}
2192
-
2193
- # sample features
2194
- sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2195
- if len(sample_features) > 0:
2196
- feature_sets[self._sample_metadata_key] = Schema(
2197
- features=sample_features
2198
- )
2199
-
2200
- # table features
2201
- for table, field in var_fields.items():
2202
- table_fs = parse_staged_feature_sets_from_anndata(
2203
- self._sdata[table],
2204
- var_field=field,
2205
- obs_field=obs_fields.get(table, Feature.name),
2206
- mute=mute,
2207
- organism=organism,
2208
- )
2209
- for k, v in table_fs.items():
2210
- feature_sets[f"['{table}'].{k}"] = v
2211
-
2212
- def _unify_staged_feature_sets_by_hash(
2213
- feature_sets: MutableMapping[str, Schema],
2214
- ):
2215
- unique_values: dict[str, Any] = {}
2216
-
2217
- for key, value in feature_sets.items():
2218
- value_hash = (
2219
- value.hash
2220
- ) # Assuming each value has a .hash attribute
2221
- if value_hash in unique_values:
2222
- feature_sets[key] = unique_values[value_hash]
2223
- else:
2224
- unique_values[value_hash] = value
2225
-
2226
- return feature_sets
2227
-
2228
- # link feature sets
2229
- host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2230
- feature_sets
2325
+ if len(labels) == 0:
2326
+ continue
2327
+ if hasattr(registry, "_name_field"):
2328
+ label_ref_is_name = field.field.name == registry._name_field
2329
+ add_labels(
2330
+ artifact,
2331
+ records=labels,
2332
+ feature=feature,
2333
+ feature_ref_is_name=feature_ref_is_name,
2334
+ label_ref_is_name=label_ref_is_name,
2335
+ from_curator=True,
2231
2336
  )
2232
- host.save()
2233
-
2234
- _add_set_from_spatialdata(
2235
- self._artifact, var_fields=self._var_fields, **feature_kwargs
2236
- )
2237
-
2238
- # Link labels
2239
- def _add_labels_from_spatialdata(
2240
- data,
2241
- artifact: Artifact,
2242
- fields: dict[str, FieldAttr],
2243
- feature_ref_is_name: bool | None = None,
2244
- ):
2245
- """Add Labels from SpatialData."""
2246
- features = Feature.lookup().dict()
2247
- for key, field in fields.items():
2248
- feature = features.get(key)
2249
- registry = field.field.model
2250
- filter_kwargs = check_registry_organism(registry, self._organism)
2251
- filter_kwargs_current = get_current_filter_kwargs(
2252
- registry, filter_kwargs
2253
- )
2254
- df = data if isinstance(data, pd.DataFrame) else data.obs
2255
- labels = registry.from_values(
2256
- df[key],
2257
- field=field,
2258
- **filter_kwargs_current,
2259
- )
2260
- if len(labels) == 0:
2261
- continue
2262
2337
 
2263
- label_ref_is_name = None
2264
- if hasattr(registry, "_name_field"):
2265
- label_ref_is_name = field.field.name == registry._name_field
2266
- add_labels(
2267
- artifact,
2268
- records=labels,
2269
- feature=feature,
2270
- feature_ref_is_name=feature_ref_is_name,
2271
- label_ref_is_name=label_ref_is_name,
2272
- from_curator=True,
2273
- )
2274
-
2275
- for accessor, accessor_fields in self._categoricals.items():
2276
- column_field = self._var_fields.get(accessor)
2277
- if accessor == self._sample_metadata_key:
2278
- _add_labels_from_spatialdata(
2279
- self._sample_metadata,
2280
- self._artifact,
2281
- accessor_fields,
2282
- feature_ref_is_name=(
2283
- None if column_field is None else _ref_is_name(column_field)
2284
- ),
2285
- )
2286
- else:
2287
- _add_labels_from_spatialdata(
2288
- self._sdata.tables[accessor],
2289
- self._artifact,
2290
- accessor_fields,
2291
- feature_ref_is_name=(
2292
- None if column_field is None else _ref_is_name(column_field)
2293
- ),
2294
- )
2295
-
2296
- finally:
2297
- settings.verbosity = verbosity
2298
-
2299
- slug = ln_setup.settings.instance.slug
2300
- if ln_setup.settings.instance.is_remote: # pragma: no cover
2301
- logger.important(
2302
- f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2303
- )
2304
-
2305
- return self._artifact
2338
+ return artifact.save()
2306
2339
 
2307
2340
 
2308
- def _restrict_obs_fields(
2309
- obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2310
- ) -> dict[str, str]:
2311
- """Restrict the obs fields to name return only available obs fields.
2341
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2342
+ """Categorical manager for `AnnData` respecting the CELLxGENE schema.
2312
2343
 
2313
- To simplify the curation, we only validate against either name or ontology_id.
2314
- If both are available, we validate against ontology_id.
2315
- If none are available, we validate against name.
2344
+ This will be superceded by a schema-based curation flow.
2316
2345
  """
2317
- obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2318
- for name, field in obs_fields.items():
2319
- if name.endswith("_ontology_term_id"):
2320
- continue
2321
- # if both the ontology id and the name are present, only validate on the ontology_id
2322
- if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2323
- obs_fields_unique.pop(name)
2324
- # if the neither name nor ontology id are present, validate on the name
2325
- # this will raise error downstream, we just use name to be more readable
2326
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2327
- obs_fields_unique[name] = field
2328
-
2329
- # Only retain obs_fields_unique that have keys in adata.obs.columns
2330
- available_obs_fields = {
2331
- k: v for k, v in obs_fields_unique.items() if k in obs.columns
2332
- }
2333
-
2334
- return available_obs_fields
2335
-
2336
-
2337
- def _add_defaults_to_obs(
2338
- obs: pd.DataFrame,
2339
- defaults: dict[str, str],
2340
- ) -> None:
2341
- """Add default columns and values to obs DataFrame."""
2342
- added_defaults: dict = {}
2343
- for name, default in defaults.items():
2344
- if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2345
- obs[name] = default
2346
- added_defaults[name] = default
2347
- logger.important(
2348
- f"added default value '{default}' to the adata.obs['{name}']"
2349
- )
2350
-
2351
-
2352
- class CellxGeneAnnDataCatManager(AnnDataCatManager):
2353
- """Annotation flow of AnnData based on CELLxGENE schema."""
2354
2346
 
2355
- _controls_were_created: bool | None = None
2347
+ cxg_categoricals_defaults = {
2348
+ "cell_type": "unknown",
2349
+ "development_stage": "unknown",
2350
+ "disease": "normal",
2351
+ "donor_id": "unknown",
2352
+ "self_reported_ethnicity": "unknown",
2353
+ "sex": "unknown",
2354
+ "suspension_type": "cell",
2355
+ "tissue_type": "tissue",
2356
+ }
2356
2357
 
2357
2358
  def __init__(
2358
2359
  self,
2359
- adata: ad.AnnData | UPathStr,
2360
+ adata: ad.AnnData,
2360
2361
  categoricals: dict[str, FieldAttr] | None = None,
2361
2362
  organism: Literal["human", "mouse"] = "human",
2362
2363
  *,
2364
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2363
2365
  defaults: dict[str, str] = None,
2364
2366
  extra_sources: dict[str, Record] = None,
2365
- schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2366
2367
  verbosity: str = "hint",
2367
2368
  ) -> None:
2368
2369
  """CELLxGENE schema curator.
@@ -2372,304 +2373,85 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2372
2373
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2373
2374
  The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2374
2375
  organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2376
+ schema_version: The CELLxGENE schema version to curate against.
2375
2377
  defaults: Default values that are set if columns or column values are missing.
2376
2378
  extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2377
2379
  These extra sources are joined with the CELLxGENE fixed sources.
2378
2380
  Use this parameter when subclassing.
2379
- exclude: A dictionary mapping column names to values to exclude.
2380
- schema_version: The CELLxGENE schema version to curate against.
2381
2381
  verbosity: The verbosity level.
2382
-
2383
2382
  """
2384
2383
  import bionty as bt
2385
2384
 
2386
- CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2385
+ from ._cellxgene_schemas import (
2386
+ _add_defaults_to_obs,
2387
+ _create_sources,
2388
+ _init_categoricals_additional_values,
2389
+ _restrict_obs_fields,
2390
+ )
2387
2391
 
2388
- var_index: FieldAttr = bt.Gene.ensembl_gene_id
2392
+ # Add defaults first to ensure that we fetch valid sources
2393
+ if defaults:
2394
+ _add_defaults_to_obs(adata.obs, defaults)
2389
2395
 
2396
+ # Filter categoricals based on what's present in adata
2390
2397
  if categoricals is None:
2391
- categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2398
+ categoricals = self._get_cxg_categoricals()
2399
+ categoricals = _restrict_obs_fields(adata.obs, categoricals)
2392
2400
 
2393
- self.organism = organism
2394
-
2395
- VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2396
- if schema_version not in VALID_SCHEMA_VERSIONS:
2397
- valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2398
- raise ValueError(
2399
- f"Invalid schema_version: {schema_version}. "
2400
- f"Valid versions are: {valid_versions}"
2401
- )
2401
+ # Configure sources
2402
+ sources = _create_sources(categoricals, schema_version, organism)
2402
2403
  self.schema_version = schema_version
2403
2404
  self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2404
- with resources.path(
2405
- "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2406
- ) as schema_versions_path:
2407
- self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2408
- self.schema_version
2409
- ]
2410
-
2411
- # Fetch AnnData obs to be able to set defaults and get sources
2412
- if isinstance(adata, ad.AnnData):
2413
- self._adata_obs = adata.obs
2414
- else:
2415
- self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2416
-
2417
- # Add defaults first to ensure that we fetch valid sources
2418
- if defaults:
2419
- _add_defaults_to_obs(self._adata_obs, defaults)
2420
-
2421
- self.sources = self._create_sources(self._adata_obs)
2422
- self.sources = {
2423
- entity: source
2424
- for entity, source in self.sources.items()
2425
- if source is not None
2426
- }
2427
-
2428
2405
  # These sources are not a part of the cellxgene schema but rather passed through.
2429
2406
  # This is useful when other Curators extend the CELLxGENE curator
2430
2407
  if extra_sources:
2431
- self.sources = self.sources | extra_sources
2408
+ sources = sources | extra_sources
2432
2409
 
2433
- # Exclude default values from validation because they are not available in the pinned sources
2434
- exclude_keys = {
2435
- entity: default
2436
- for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2437
- if entity in self._adata_obs.columns # type: ignore
2438
- }
2410
+ _init_categoricals_additional_values()
2439
2411
 
2440
2412
  super().__init__(
2441
2413
  data=adata,
2442
- var_index=var_index,
2443
- categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2414
+ var_index=bt.Gene.ensembl_gene_id,
2415
+ categoricals=categoricals,
2444
2416
  verbosity=verbosity,
2445
2417
  organism=organism,
2446
- sources=self.sources,
2447
- exclude=exclude_keys,
2418
+ sources=sources,
2448
2419
  )
2449
2420
 
2450
2421
  @classmethod
2451
- def _init_categoricals_additional_values(cls) -> None:
2452
- import bionty as bt
2453
-
2454
- import lamindb as ln
2455
-
2456
- # Note: if you add another control below, be mindful to change the if condition that
2457
- # triggers whether creating these records is re-considered
2458
- if cls._controls_were_created is None:
2459
- cls._controls_were_created = (
2460
- ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2461
- is not None
2462
- )
2463
- if not cls._controls_were_created:
2464
- logger.important("Creating control labels in the CellxGene schema.")
2465
- bt.CellType(
2466
- ontology_id="unknown",
2467
- name="unknown",
2468
- description="From CellxGene schema.",
2469
- ).save()
2470
- pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2471
- normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2472
- bt.Disease(
2473
- uid=normal.uid,
2474
- name=normal.name,
2475
- ontology_id=normal.ontology_id,
2476
- description=normal.description,
2477
- source=normal.source,
2478
- ).save()
2479
- bt.Ethnicity(
2480
- ontology_id="na", name="na", description="From CellxGene schema."
2481
- ).save()
2482
- bt.Ethnicity(
2483
- ontology_id="unknown",
2484
- name="unknown",
2485
- description="From CellxGene schema.",
2486
- ).save()
2487
- bt.DevelopmentalStage(
2488
- ontology_id="unknown",
2489
- name="unknown",
2490
- description="From CellxGene schema.",
2491
- ).save()
2492
- bt.Phenotype(
2493
- ontology_id="unknown",
2494
- name="unknown",
2495
- description="From CellxGene schema.",
2496
- ).save()
2497
-
2498
- tissue_type = ln.ULabel(
2499
- name="TissueType",
2500
- is_type=True,
2501
- description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2502
- ).save()
2503
- ln.ULabel(
2504
- name="tissue", type=tissue_type, description="From CellxGene schema."
2505
- ).save()
2506
- ln.ULabel(
2507
- name="organoid", type=tissue_type, description="From CellxGene schema."
2508
- ).save()
2509
- ln.ULabel(
2510
- name="cell culture",
2511
- type=tissue_type,
2512
- description="From CellxGene schema.",
2513
- ).save()
2514
-
2515
- suspension_type = ln.ULabel(
2516
- name="SuspensionType",
2517
- is_type=True,
2518
- description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2519
- ).save()
2520
- ln.ULabel(
2521
- name="cell", type=suspension_type, description="From CellxGene schema."
2522
- ).save()
2523
- ln.ULabel(
2524
- name="nucleus",
2525
- type=suspension_type,
2526
- description="From CellxGene schema.",
2527
- ).save()
2528
- ln.ULabel(name="na", type=suspension_type).save()
2529
-
2530
- @classmethod
2531
- def _get_categoricals(cls) -> dict[str, FieldAttr]:
2532
- import bionty as bt
2533
-
2534
- return {
2535
- "assay": bt.ExperimentalFactor.name,
2536
- "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2537
- "cell_type": bt.CellType.name,
2538
- "cell_type_ontology_term_id": bt.CellType.ontology_id,
2539
- "development_stage": bt.DevelopmentalStage.name,
2540
- "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2541
- "disease": bt.Disease.name,
2542
- "disease_ontology_term_id": bt.Disease.ontology_id,
2543
- # "donor_id": "str", via pandera
2544
- "self_reported_ethnicity": bt.Ethnicity.name,
2545
- "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2546
- "sex": bt.Phenotype.name,
2547
- "sex_ontology_term_id": bt.Phenotype.ontology_id,
2548
- "suspension_type": ULabel.name,
2549
- "tissue": bt.Tissue.name,
2550
- "tissue_ontology_term_id": bt.Tissue.ontology_id,
2551
- "tissue_type": ULabel.name,
2552
- "organism": bt.Organism.name,
2553
- "organism_ontology_term_id": bt.Organism.ontology_id,
2554
- }
2555
-
2556
- @classmethod
2422
+ @deprecated(new_name="cxg_categoricals_defaults")
2557
2423
  def _get_categoricals_defaults(cls) -> dict[str, str]:
2558
- return {
2559
- "cell_type": "unknown",
2560
- "development_stage": "unknown",
2561
- "disease": "normal",
2562
- "donor_id": "unknown",
2563
- "self_reported_ethnicity": "unknown",
2564
- "sex": "unknown",
2565
- "suspension_type": "cell",
2566
- "tissue_type": "tissue",
2567
- }
2568
-
2569
- @property
2570
- def pinned_ontologies(self) -> pd.DataFrame:
2571
- return self._pinned_ontologies
2572
-
2573
- @property
2574
- def adata(self) -> AnnData:
2575
- return self._adata
2576
-
2577
- def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2578
- """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2579
- import bionty as bt
2580
-
2581
- # fmt: off
2582
- def _fetch_bionty_source(
2583
- entity: str, organism: str, source: str
2584
- ) -> bt.Source | None:
2585
- """Fetch the Bionty source of the pinned ontology.
2586
-
2587
- Returns None if the source does not exist.
2588
- """
2589
- version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2590
- (self._pinned_ontologies["organism"] == organism) &
2591
- (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2592
- return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2593
-
2594
- entity_mapping = {
2595
- "var_index": ("Gene", self.organism, "ensembl"),
2596
- "cell_type": ("CellType", "all", "cl"),
2597
- "assay": ("ExperimentalFactor", "all", "efo"),
2598
- "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2599
- "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2600
- "disease": ("Disease", "all", "mondo"),
2601
- # "organism": ("Organism", "vertebrates", "ensembl"),
2602
- "sex": ("Phenotype", "all", "pato"),
2603
- "tissue": ("Tissue", "all", "uberon"),
2604
- }
2605
- # fmt: on
2606
-
2607
- # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2608
- entity_to_sources = {
2609
- entity: _fetch_bionty_source(*params)
2610
- for entity, params in entity_mapping.items()
2611
- if entity in obs.columns
2612
- or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2613
- or entity == "var_index"
2614
- }
2615
-
2616
- return entity_to_sources
2424
+ return cls.cxg_categoricals_defaults
2617
2425
 
2618
- def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2619
- """Converts a column that stores a name into a column that stores the ontology id.
2426
+ @classmethod
2427
+ def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
2428
+ """Returns the CELLxGENE schema mapped fields."""
2429
+ from ._cellxgene_schemas import _get_cxg_categoricals
2620
2430
 
2621
- cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2622
- """
2623
- field_name = field.field.name
2624
- assert field_name == "name" # noqa: S101
2625
- cols = ["name", "ontology_id"]
2626
- registry = field.field.model
2627
-
2628
- if hasattr(registry, "ontology_id"):
2629
- validated_records = registry.filter(**{f"{field_name}__in": values})
2630
- mapper = (
2631
- pd.DataFrame(validated_records.values_list(*cols))
2632
- .set_index(0)
2633
- .to_dict()[1]
2634
- )
2635
- return values.map(mapper)
2431
+ return _get_cxg_categoricals()
2636
2432
 
2637
- def validate(self) -> bool: # type: ignore
2433
+ def validate(self) -> bool:
2638
2434
  """Validates the AnnData object against most cellxgene requirements."""
2435
+ from ._cellxgene_schemas import RESERVED_NAMES
2436
+
2639
2437
  # Verify that all required obs columns are present
2438
+ required_columns = list(self.cxg_categoricals_defaults.keys()) + ["donor_id"]
2640
2439
  missing_obs_fields = [
2641
2440
  name
2642
- for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2441
+ for name in required_columns
2643
2442
  if name not in self._adata.obs.columns
2644
2443
  and f"{name}_ontology_term_id" not in self._adata.obs.columns
2645
2444
  ]
2646
2445
  if len(missing_obs_fields) > 0:
2647
- missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2648
- logger.error(f"missing required obs columns {missing_obs_fields_str}")
2649
- logger.info(
2650
- "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2651
- "to automatically add these columns with default values."
2446
+ logger.error(
2447
+ f"missing required obs columns {_format_values(missing_obs_fields)}\n"
2448
+ " → consider initializing a Curate object with `defaults=cxg.CellxGeneAnnDataCatManager.cxg_categoricals_defaults` to automatically add these columns with default values"
2652
2449
  )
2653
2450
  return False
2654
2451
 
2655
2452
  # Verify that no cellxgene reserved names are present
2656
- reserved_names = {
2657
- "ethnicity",
2658
- "ethnicity_ontology_term_id",
2659
- "X_normalization",
2660
- "default_field",
2661
- "layer_descriptions",
2662
- "tags",
2663
- "versions",
2664
- "contributors",
2665
- "preprint_doi",
2666
- "project_description",
2667
- "project_links",
2668
- "project_name",
2669
- "publication_doi",
2670
- }
2671
2453
  matched_columns = [
2672
- column for column in self._adata.obs.columns if column in reserved_names
2454
+ column for column in self._adata.obs.columns if column in RESERVED_NAMES
2673
2455
  ]
2674
2456
  if len(matched_columns) > 0:
2675
2457
  raise ValueError(
@@ -2696,6 +2478,26 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2696
2478
  Returns:
2697
2479
  An AnnData object which adheres to the cellxgene-schema.
2698
2480
  """
2481
+
2482
+ def _convert_name_to_ontology_id(values: pd.Series, field: FieldAttr):
2483
+ """Converts a column that stores a name into a column that stores the ontology id.
2484
+
2485
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2486
+ """
2487
+ field_name = field.field.name
2488
+ assert field_name == "name" # noqa: S101
2489
+ cols = ["name", "ontology_id"]
2490
+ registry = field.field.model
2491
+
2492
+ if hasattr(registry, "ontology_id"):
2493
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2494
+ mapper = (
2495
+ pd.DataFrame(validated_records.values_list(*cols))
2496
+ .set_index(0)
2497
+ .to_dict()[1]
2498
+ )
2499
+ return values.map(mapper)
2500
+
2699
2501
  # Create a copy since we modify the AnnData object extensively
2700
2502
  adata_cxg = self._adata.copy()
2701
2503
 
@@ -2715,7 +2517,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2715
2517
  # convert name column to ontology_term_id column
2716
2518
  for column in adata_cxg.obs.columns:
2717
2519
  if column in self.categoricals and not column.endswith("_ontology_term_id"):
2718
- mapped_column = self._convert_name_to_ontology_id(
2520
+ mapped_column = _convert_name_to_ontology_id(
2719
2521
  adata_cxg.obs[column], field=self.categoricals.get(column)
2720
2522
  )
2721
2523
  if mapped_column is not None:
@@ -2881,7 +2683,7 @@ class TimeHandler:
2881
2683
 
2882
2684
 
2883
2685
  class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2884
- """Curator flow for Perturbation data."""
2686
+ """Categorical manager for `AnnData` to manage perturbations."""
2885
2687
 
2886
2688
  PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2887
2689
 
@@ -2892,45 +2694,32 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2892
2694
  pert_dose: bool = True,
2893
2695
  pert_time: bool = True,
2894
2696
  *,
2697
+ cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2895
2698
  verbosity: str = "hint",
2896
- cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2897
2699
  ):
2898
2700
  """Initialize the curator with configuration and validation settings."""
2899
- import bionty as bt
2900
-
2901
2701
  self._pert_time = pert_time
2902
2702
  self._pert_dose = pert_dose
2903
2703
 
2904
2704
  self._validate_initial_data(adata)
2905
- self._setup_configuration(adata)
2906
-
2907
- self._setup_sources(adata)
2908
- self._setup_compound_source()
2705
+ categoricals, categoricals_defaults = self._configure_categoricals(adata)
2909
2706
 
2910
2707
  super().__init__(
2911
2708
  adata=adata,
2912
- categoricals=self.PT_CATEGORICALS,
2913
- defaults=self.PT_DEFAULT_VALUES,
2914
- verbosity=verbosity,
2709
+ categoricals=categoricals,
2710
+ defaults=categoricals_defaults,
2915
2711
  organism=organism,
2916
- extra_sources=self.PT_SOURCES,
2712
+ extra_sources=self._configure_sources(adata),
2917
2713
  schema_version=cxg_schema_version,
2714
+ verbosity=verbosity,
2918
2715
  )
2919
2716
 
2920
- def _setup_configuration(self, adata: ad.AnnData):
2717
+ def _configure_categoricals(self, adata: ad.AnnData):
2921
2718
  """Set up default configuration values."""
2922
2719
  import bionty as bt
2923
2720
  import wetlab as wl
2924
2721
 
2925
- self.PT_DEFAULT_VALUES = (
2926
- CellxGeneAnnDataCatManager._get_categoricals_defaults()
2927
- | {
2928
- "cell_line": "unknown",
2929
- "pert_target": "unknown",
2930
- }
2931
- )
2932
-
2933
- self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2722
+ categoricals = CellxGeneAnnDataCatManager._get_cxg_categoricals() | {
2934
2723
  k: v
2935
2724
  for k, v in {
2936
2725
  "cell_line": bt.CellLine.name,
@@ -2942,22 +2731,41 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2942
2731
  }.items()
2943
2732
  if k in adata.obs.columns
2944
2733
  }
2945
- # if "donor_id" in self.PT_CATEGORICALS:
2946
- # self.PT_CATEGORICALS["donor_id"] = Donor.name
2734
+ # if "donor_id" in categoricals:
2735
+ # categoricals["donor_id"] = Donor.name
2736
+
2737
+ categoricals_defaults = CellxGeneAnnDataCatManager.cxg_categoricals_defaults | {
2738
+ "cell_line": "unknown",
2739
+ "pert_target": "unknown",
2740
+ }
2947
2741
 
2948
- def _setup_sources(self, adata: ad.AnnData):
2742
+ return categoricals, categoricals_defaults
2743
+
2744
+ def _configure_sources(self, adata: ad.AnnData):
2949
2745
  """Set up data sources."""
2950
- self.PT_SOURCES = {}
2746
+ import bionty as bt
2747
+ import wetlab as wl
2748
+
2749
+ sources = {}
2750
+ # # do not yet specify cell_line source
2951
2751
  # if "cell_line" in adata.obs.columns:
2952
- # self.PT_SOURCES["cell_line"] = (
2953
- # bt.Source.filter(name="depmap").first()
2954
- # )
2752
+ # sources["cell_line"] = bt.Source.filter(
2753
+ # entity="bionty.CellLine", name="depmap"
2754
+ # ).first()
2955
2755
  if "pert_compound" in adata.obs.columns:
2956
- import bionty as bt
2756
+ with logger.mute():
2757
+ chebi_source = bt.Source.filter(
2758
+ entity="wetlab.Compound", name="chebi"
2759
+ ).first()
2760
+ if not chebi_source:
2761
+ wl.Compound.add_source(
2762
+ bt.Source.filter(entity="Drug", name="chebi").first()
2763
+ )
2957
2764
 
2958
- self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2765
+ sources["pert_compound"] = bt.Source.filter(
2959
2766
  entity="wetlab.Compound", name="chebi"
2960
2767
  ).first()
2768
+ return sources
2961
2769
 
2962
2770
  def _validate_initial_data(self, adata: ad.AnnData):
2963
2771
  """Validate the initial data structure."""
@@ -3005,20 +2813,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3005
2813
  adata.obs[col_name].cat.remove_unused_categories()
3006
2814
  logger.important(f"mapped 'pert_name' to '{col_name}'")
3007
2815
 
3008
- def _setup_compound_source(self):
3009
- """Set up the compound source with muted logging."""
3010
- import bionty as bt
3011
- import wetlab as wl
3012
-
3013
- with logger.mute():
3014
- chebi_source = bt.Source.filter(
3015
- entity="wetlab.Compound", name="chebi"
3016
- ).first()
3017
- if not chebi_source:
3018
- wl.Compound.add_source(
3019
- bt.Source.filter(entity="Drug", name="chebi").first()
3020
- )
3021
-
3022
2816
  def validate(self) -> bool: # type: ignore
3023
2817
  """Validate the AnnData object."""
3024
2818
  validated = super().validate()
@@ -3136,70 +2930,47 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
3136
2930
 
3137
2931
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
3138
2932
  """Make sure the source and organism are saved in the same database as the registry."""
3139
- from lamindb.core._settings import settings
3140
-
3141
2933
  db = registry.filter().db
3142
2934
  source = kwargs.get("source")
3143
2935
  organism = kwargs.get("organism")
3144
2936
  filter_kwargs = kwargs.copy()
3145
- try:
3146
- verbosity = settings.verbosity
3147
- settings.verbosity = "error"
3148
- if isinstance(organism, Record) and organism._state.db != "default":
3149
- if db is None or db == "default":
3150
- organism_default = copy.copy(organism)
3151
- # save the organism record in the default database
3152
- organism_default.save()
3153
- filter_kwargs["organism"] = organism_default
3154
- if isinstance(source, Record) and source._state.db != "default":
3155
- if db is None or db == "default":
3156
- source_default = copy.copy(source)
3157
- # save the source record in the default database
3158
- source_default.save()
3159
- filter_kwargs["source"] = source_default
3160
- finally:
3161
- settings.verbosity = verbosity
3162
- return filter_kwargs
3163
-
3164
2937
 
3165
- def inspect_instance(
3166
- values: Iterable[str],
3167
- field: FieldAttr,
3168
- registry: type[Record],
3169
- exclude: str | list | None = None,
3170
- **kwargs,
3171
- ):
3172
- """Inspect values using a registry."""
3173
- # inspect exclude values in the default instance
3174
- values = list(values)
3175
- include_validated = []
3176
- if exclude is not None:
3177
- exclude = [exclude] if isinstance(exclude, str) else exclude
3178
- exclude = [i for i in exclude if i in values]
3179
- if len(exclude) > 0:
3180
- # exclude values are validated without source and organism
3181
- inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
3182
- # if exclude values are validated, remove them from the values
3183
- values = [i for i in values if i not in inspect_result_exclude.validated]
3184
- include_validated = inspect_result_exclude.validated
3185
-
3186
- inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
3187
- inspect_result._validated += include_validated
3188
- inspect_result._non_validated = [
3189
- i for i in inspect_result.non_validated if i not in include_validated
3190
- ]
2938
+ if isinstance(organism, Record) and organism._state.db != "default":
2939
+ if db is None or db == "default":
2940
+ organism_default = copy.copy(organism)
2941
+ # save the organism record in the default database
2942
+ organism_default.save()
2943
+ filter_kwargs["organism"] = organism_default
2944
+ if isinstance(source, Record) and source._state.db != "default":
2945
+ if db is None or db == "default":
2946
+ source_default = copy.copy(source)
2947
+ # save the source record in the default database
2948
+ source_default.save()
2949
+ filter_kwargs["source"] = source_default
3191
2950
 
3192
- return inspect_result
2951
+ return filter_kwargs
3193
2952
 
3194
2953
 
3195
- def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
2954
+ def get_organism_kwargs(
2955
+ field: FieldAttr, organism: str | None = None
2956
+ ) -> dict[str, str]:
3196
2957
  """Check if a registry needs an organism and return the organism name."""
3197
- if hasattr(registry, "organism_id"):
2958
+ registry = field.field.model
2959
+ if registry.__base__.__name__ == "BioRecord":
3198
2960
  import bionty as bt
2961
+ from bionty._organism import is_organism_required
3199
2962
 
3200
- if organism is None and bt.settings.organism is None:
3201
- return {}
3202
- return {"organism": organism or bt.settings.organism.name}
2963
+ from ..models._from_values import get_organism_record_from_field
2964
+
2965
+ if is_organism_required(registry):
2966
+ if organism is not None or bt.settings.organism is not None:
2967
+ return {"organism": organism or bt.settings.organism.name}
2968
+ else:
2969
+ organism_record = get_organism_record_from_field(
2970
+ field, organism=organism
2971
+ )
2972
+ if organism_record is not None:
2973
+ return {"organism": organism_record.name}
3203
2974
  return {}
3204
2975
 
3205
2976
 
@@ -3209,7 +2980,6 @@ def validate_categories(
3209
2980
  key: str,
3210
2981
  organism: str | None = None,
3211
2982
  source: Record | None = None,
3212
- exclude: str | list | None = None,
3213
2983
  hint_print: str | None = None,
3214
2984
  curator: CatManager | None = None,
3215
2985
  ) -> tuple[bool, list[str]]:
@@ -3221,13 +2991,9 @@ def validate_categories(
3221
2991
  key: The key referencing the slot in the DataFrame.
3222
2992
  organism: The organism name.
3223
2993
  source: The source record.
3224
- exclude: Exclude specific values from validation.
3225
2994
  standardize: Whether to standardize the values.
3226
2995
  hint_print: The hint to print that suggests fixing non-validated values.
3227
2996
  """
3228
- from lamindb.core._settings import settings
3229
- from lamindb.models._from_values import _format_values
3230
-
3231
2997
  model_field = f"{field.field.model.__name__}.{field.field.name}"
3232
2998
 
3233
2999
  def _log_mapping_info():
@@ -3237,36 +3003,25 @@ def validate_categories(
3237
3003
 
3238
3004
  registry = field.field.model
3239
3005
 
3240
- # {"organism": organism_name/organism_record}
3241
- kwargs = check_registry_organism(registry, organism)
3242
- kwargs.update({"source": source} if source else {})
3243
- kwargs_current = get_current_filter_kwargs(registry, kwargs)
3006
+ kwargs_current = get_current_filter_kwargs(
3007
+ registry, {"organism": organism, "source": source}
3008
+ )
3244
3009
 
3245
3010
  # inspect values from the default instance
3246
- inspect_result = inspect_instance(
3247
- values=values,
3248
- field=field,
3249
- registry=registry,
3250
- exclude=exclude,
3251
- **kwargs_current,
3252
- )
3011
+ inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
3253
3012
  non_validated = inspect_result.non_validated
3254
3013
  syn_mapper = inspect_result.synonyms_mapper
3255
3014
 
3256
- # inspect the non-validated values from public (bionty only)
3015
+ # inspect the non-validated values from public (BioRecord only)
3257
3016
  values_validated = []
3258
3017
  if hasattr(registry, "public"):
3259
- verbosity = settings.verbosity
3260
- try:
3261
- settings.verbosity = "error"
3262
- public_records = registry.from_values(
3263
- non_validated,
3264
- field=field,
3265
- **kwargs_current,
3266
- )
3267
- values_validated += [getattr(r, field.field.name) for r in public_records]
3268
- finally:
3269
- settings.verbosity = verbosity
3018
+ public_records = registry.from_values(
3019
+ non_validated,
3020
+ field=field,
3021
+ mute=True,
3022
+ **kwargs_current,
3023
+ )
3024
+ values_validated += [getattr(r, field.field.name) for r in public_records]
3270
3025
 
3271
3026
  # logging messages
3272
3027
  non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
@@ -3330,7 +3085,6 @@ def validate_categories_in_df(
3330
3085
  df: pd.DataFrame,
3331
3086
  fields: dict[str, FieldAttr],
3332
3087
  sources: dict[str, Record] = None,
3333
- exclude: dict | None = None,
3334
3088
  curator: CatManager | None = None,
3335
3089
  **kwargs,
3336
3090
  ) -> tuple[bool, dict]:
@@ -3348,7 +3102,6 @@ def validate_categories_in_df(
3348
3102
  field=field,
3349
3103
  key=key,
3350
3104
  source=sources.get(key),
3351
- exclude=exclude.get(key) if exclude else None,
3352
3105
  curator=curator,
3353
3106
  **kwargs,
3354
3107
  )
@@ -3359,9 +3112,10 @@ def validate_categories_in_df(
3359
3112
 
3360
3113
 
3361
3114
  def save_artifact(
3362
- data: pd.DataFrame | ad.AnnData | MuData,
3115
+ data: pd.DataFrame | ScverseDataStructures,
3116
+ *,
3363
3117
  fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
3364
- columns_field: FieldAttr | dict[str, FieldAttr] | None = None,
3118
+ index_field: FieldAttr | dict[str, FieldAttr] | None = None,
3365
3119
  description: str | None = None,
3366
3120
  organism: str | None = None,
3367
3121
  key: str | None = None,
@@ -3369,73 +3123,52 @@ def save_artifact(
3369
3123
  revises: Artifact | None = None,
3370
3124
  run: Run | None = None,
3371
3125
  schema: Schema | None = None,
3126
+ **kwargs,
3372
3127
  ) -> Artifact:
3373
3128
  """Save all metadata with an Artifact.
3374
3129
 
3375
3130
  Args:
3376
- data: The DataFrame/AnnData/MuData object to save.
3131
+ data: The object to save.
3377
3132
  fields: A dictionary mapping obs_column to registry_field.
3378
- columns_field: The registry field to validate variables index against.
3133
+ index_field: The registry field to validate variables index against.
3379
3134
  description: A description of the artifact.
3380
3135
  organism: The organism name.
3381
- type: The artifact type.
3382
3136
  key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3383
3137
  artifact: A already registered artifact. Passing this will not save a new artifact from data.
3384
3138
  revises: Previous version of the artifact. Triggers a revision.
3385
3139
  run: The run that creates the artifact.
3140
+ schema: The Schema to associate with the Artifact.
3386
3141
 
3387
3142
  Returns:
3388
3143
  The saved Artifact.
3389
3144
  """
3390
- from ..models.artifact import add_labels, data_is_anndata, data_is_mudata
3145
+ from ..models.artifact import add_labels
3391
3146
 
3392
3147
  if artifact is None:
3393
- if data_is_anndata(data):
3394
- artifact = Artifact.from_anndata(
3148
+ if isinstance(data, pd.DataFrame):
3149
+ artifact = Artifact.from_df(
3395
3150
  data, description=description, key=key, revises=revises, run=run
3396
3151
  )
3397
- elif isinstance(data, pd.DataFrame):
3398
- artifact = Artifact.from_df(
3152
+ elif isinstance(data, AnnData):
3153
+ artifact = Artifact.from_anndata(
3399
3154
  data, description=description, key=key, revises=revises, run=run
3400
3155
  )
3401
3156
  elif data_is_mudata(data):
3402
3157
  artifact = Artifact.from_mudata(
3403
- data,
3404
- description=description,
3405
- key=key,
3406
- revises=revises,
3407
- run=run,
3158
+ data, description=description, key=key, revises=revises, run=run
3159
+ )
3160
+ elif data_is_spatialdata(data):
3161
+ artifact = Artifact.from_spatialdata(
3162
+ data, description=description, key=key, revises=revises, run=run
3163
+ )
3164
+ else:
3165
+ raise InvalidArgument( # pragma: no cover
3166
+ "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
3408
3167
  )
3409
- artifact.schema = schema
3410
3168
  artifact.save()
3411
3169
 
3412
- if organism is not None and columns_field is not None:
3413
- feature_kwargs = check_registry_organism(
3414
- (
3415
- list(columns_field.values())[0].field.model
3416
- if isinstance(columns_field, dict)
3417
- else columns_field.field.model
3418
- ),
3419
- organism,
3420
- )
3421
- else:
3422
- feature_kwargs = {}
3423
-
3424
- if artifact.otype == "DataFrame":
3425
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
3426
- elif artifact.otype == "AnnData":
3427
- artifact.features._add_set_from_anndata( # type: ignore
3428
- var_field=columns_field, **feature_kwargs
3429
- )
3430
- elif artifact.otype == "MuData":
3431
- artifact.features._add_set_from_mudata( # type: ignore
3432
- var_fields=columns_field, **feature_kwargs
3433
- )
3434
- else:
3435
- raise NotImplementedError
3436
-
3437
3170
  def _add_labels(
3438
- data,
3171
+ data: pd.DataFrame | ScverseDataStructures,
3439
3172
  artifact: Artifact,
3440
3173
  fields: dict[str, FieldAttr],
3441
3174
  feature_ref_is_name: bool | None = None,
@@ -3444,19 +3177,15 @@ def save_artifact(
3444
3177
  for key, field in fields.items():
3445
3178
  feature = features.get(key)
3446
3179
  registry = field.field.model
3447
- filter_kwargs = check_registry_organism(registry, organism)
3448
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3180
+ # we don't need source here because all records are already in the DB
3181
+ filter_kwargs = get_current_filter_kwargs(registry, {"organism": organism})
3449
3182
  df = data if isinstance(data, pd.DataFrame) else data.obs
3450
3183
  # multi-value columns are separated by "|"
3451
3184
  if not df[key].isna().all() and df[key].str.contains("|").any():
3452
3185
  values = df[key].str.split("|").explode().unique()
3453
3186
  else:
3454
3187
  values = df[key].unique()
3455
- labels = registry.from_values(
3456
- values,
3457
- field=field,
3458
- **filter_kwargs_current,
3459
- )
3188
+ labels = registry.from_values(values, field=field, **filter_kwargs)
3460
3189
  if len(labels) == 0:
3461
3190
  continue
3462
3191
  label_ref_is_name = None
@@ -3471,35 +3200,87 @@ def save_artifact(
3471
3200
  from_curator=True,
3472
3201
  )
3473
3202
 
3474
- if artifact.otype == "MuData":
3475
- for modality, modality_fields in fields.items():
3476
- column_field_modality = columns_field.get(modality)
3477
- if modality == "obs":
3478
- _add_labels(
3479
- data,
3480
- artifact,
3481
- modality_fields,
3482
- feature_ref_is_name=(
3483
- None
3484
- if column_field_modality is None
3485
- else _ref_is_name(column_field_modality)
3486
- ),
3487
- )
3203
+ match artifact.otype:
3204
+ case "DataFrame":
3205
+ artifact.features._add_set_from_df(field=index_field, organism=organism) # type: ignore
3206
+ _add_labels(
3207
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3208
+ )
3209
+ case "AnnData":
3210
+ if schema is not None and "uns" in schema.slots:
3211
+ uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
3212
+ "field"
3213
+ ]
3488
3214
  else:
3489
- _add_labels(
3490
- data[modality],
3491
- artifact,
3492
- modality_fields,
3493
- feature_ref_is_name=(
3494
- None
3495
- if column_field_modality is None
3496
- else _ref_is_name(column_field_modality)
3497
- ),
3498
- )
3499
- else:
3500
- _add_labels(
3501
- data, artifact, fields, feature_ref_is_name=_ref_is_name(columns_field)
3502
- )
3215
+ uns_field = None
3216
+ artifact.features._add_set_from_anndata( # type: ignore
3217
+ var_field=index_field, uns_field=uns_field, organism=organism
3218
+ )
3219
+ _add_labels(
3220
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3221
+ )
3222
+ case "MuData":
3223
+ artifact.features._add_set_from_mudata( # type: ignore
3224
+ var_fields=index_field, organism=organism
3225
+ )
3226
+ for modality, modality_fields in fields.items():
3227
+ column_field_modality = index_field.get(modality)
3228
+ if modality == "obs":
3229
+ _add_labels(
3230
+ data,
3231
+ artifact,
3232
+ modality_fields,
3233
+ feature_ref_is_name=(
3234
+ None
3235
+ if column_field_modality is None
3236
+ else _ref_is_name(column_field_modality)
3237
+ ),
3238
+ )
3239
+ else:
3240
+ _add_labels(
3241
+ data[modality],
3242
+ artifact,
3243
+ modality_fields,
3244
+ feature_ref_is_name=(
3245
+ None
3246
+ if column_field_modality is None
3247
+ else _ref_is_name(column_field_modality)
3248
+ ),
3249
+ )
3250
+ case "SpatialData":
3251
+ artifact.features._add_set_from_spatialdata( # type: ignore
3252
+ sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
3253
+ var_fields=index_field,
3254
+ organism=organism,
3255
+ )
3256
+ sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
3257
+ for accessor, accessor_fields in fields.items():
3258
+ column_field = index_field.get(accessor)
3259
+ if accessor == sample_metadata_key:
3260
+ _add_labels(
3261
+ data.get_attrs(
3262
+ key=sample_metadata_key, return_as="df", flatten=True
3263
+ ),
3264
+ artifact,
3265
+ accessor_fields,
3266
+ feature_ref_is_name=(
3267
+ None if column_field is None else _ref_is_name(column_field)
3268
+ ),
3269
+ )
3270
+ else:
3271
+ _add_labels(
3272
+ data.tables[accessor],
3273
+ artifact,
3274
+ accessor_fields,
3275
+ feature_ref_is_name=(
3276
+ None if column_field is None else _ref_is_name(column_field)
3277
+ ),
3278
+ )
3279
+ case _:
3280
+ raise NotImplementedError # pragma: no cover
3281
+
3282
+ artifact.schema = schema
3283
+ artifact.save()
3503
3284
 
3504
3285
  slug = ln_setup.settings.instance.slug
3505
3286
  if ln_setup.settings.instance.is_remote: # pdagma: no cover
@@ -3529,8 +3310,7 @@ def update_registry(
3529
3310
  organism: str | None = None,
3530
3311
  dtype: str | None = None,
3531
3312
  source: Record | None = None,
3532
- exclude: str | list | None = None,
3533
- **kwargs,
3313
+ **create_kwargs,
3534
3314
  ) -> None:
3535
3315
  """Save features or labels records in the default instance..
3536
3316
 
@@ -3543,82 +3323,68 @@ def update_registry(
3543
3323
  organism: The organism name.
3544
3324
  dtype: The type of the feature.
3545
3325
  source: The source record.
3546
- exclude: Values to exclude from inspect.
3547
- kwargs: Additional keyword arguments to pass to the registry model to create new records.
3326
+ **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
3548
3327
  """
3549
- from lamindb.core._settings import settings
3550
3328
  from lamindb.models.save import save as ln_save
3551
3329
 
3552
3330
  registry = field.field.model
3553
- filter_kwargs = check_registry_organism(registry, organism)
3554
- filter_kwargs.update({"source": source} if source else {})
3331
+ filter_kwargs = get_current_filter_kwargs(
3332
+ registry, {"organism": organism, "source": source}
3333
+ )
3555
3334
  values = [i for i in values if isinstance(i, str) and i]
3556
3335
  if not values:
3557
3336
  return
3558
3337
 
3559
- verbosity = settings.verbosity
3560
- try:
3561
- settings.verbosity = "error"
3562
- labels_saved: dict = {"from public": [], "new": []}
3338
+ labels_saved: dict = {"from public": [], "new": []}
3563
3339
 
3564
- # inspect the default instance and save validated records from public
3565
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3566
- existing_and_public_records = registry.from_values(
3567
- list(values), field=field, **filter_kwargs_current
3568
- )
3569
- existing_and_public_labels = [
3570
- getattr(r, field.field.name) for r in existing_and_public_records
3571
- ]
3572
- # public records that are not already in the database
3573
- public_records = [r for r in existing_and_public_records if r._state.adding]
3574
- # here we check to only save the public records if they are from the specified source
3575
- # we check the uid because r.source and source can be from different instances
3576
- if source:
3577
- public_records = [r for r in public_records if r.source.uid == source.uid]
3578
- if len(public_records) > 0:
3579
- settings.verbosity = "info"
3580
- logger.info(f"saving validated records of '{key}'")
3581
- settings.verbosity = "error"
3582
- ln_save(public_records)
3583
- labels_saved["from public"] = [
3584
- getattr(r, field.field.name) for r in public_records
3585
- ]
3586
- # non-validated records from the default instance
3587
- non_validated_labels = [
3588
- i for i in values if i not in existing_and_public_labels
3340
+ # inspect the default instance and save validated records from public
3341
+ existing_and_public_records = registry.from_values(
3342
+ list(values), field=field, **filter_kwargs, mute=True
3343
+ )
3344
+ existing_and_public_labels = [
3345
+ getattr(r, field.field.name) for r in existing_and_public_records
3346
+ ]
3347
+ # public records that are not already in the database
3348
+ public_records = [r for r in existing_and_public_records if r._state.adding]
3349
+ # here we check to only save the public records if they are from the specified source
3350
+ # we check the uid because r.source and source can be from different instances
3351
+ if source:
3352
+ public_records = [r for r in public_records if r.source.uid == source.uid]
3353
+ if len(public_records) > 0:
3354
+ logger.info(f"saving validated records of '{key}'")
3355
+ ln_save(public_records)
3356
+ labels_saved["from public"] = [
3357
+ getattr(r, field.field.name) for r in public_records
3589
3358
  ]
3359
+ # non-validated records from the default instance
3360
+ non_validated_labels = [i for i in values if i not in existing_and_public_labels]
3361
+
3362
+ # save non-validated/new records
3363
+ labels_saved["new"] = non_validated_labels
3364
+ if not validated_only:
3365
+ non_validated_records: RecordList[Any] = [] # type: ignore
3366
+ if df is not None and registry == Feature:
3367
+ nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
3368
+ non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
3369
+ else:
3370
+ if (
3371
+ organism
3372
+ and hasattr(registry, "organism")
3373
+ and registry._meta.get_field("organism").is_relation
3374
+ ):
3375
+ # make sure organism record is saved to the current instance
3376
+ create_kwargs["organism"] = _save_organism(name=organism)
3590
3377
 
3591
- # save non-validated/new records
3592
- labels_saved["new"] = non_validated_labels
3593
- if not validated_only:
3594
- non_validated_records: RecordList[Any] = [] # type: ignore
3595
- if df is not None and registry == Feature:
3596
- nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
3597
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
3598
- else:
3599
- if "organism" in filter_kwargs:
3600
- # make sure organism record is saved to the current instance
3601
- filter_kwargs["organism"] = _save_organism(name=organism)
3602
- init_kwargs = {}
3603
- for value in labels_saved["new"]:
3604
- init_kwargs[field.field.name] = value
3605
- if registry == Feature:
3606
- init_kwargs["dtype"] = "cat" if dtype is None else dtype
3607
- non_validated_records.append(
3608
- registry(
3609
- **init_kwargs,
3610
- **{k: v for k, v in filter_kwargs.items() if k != "source"},
3611
- **{k: v for k, v in kwargs.items() if k != "sources"},
3612
- )
3613
- )
3614
- ln_save(non_validated_records)
3615
-
3616
- # save parent labels for ulabels, for example a parent label "project" for label "project001"
3617
- if registry == ULabel and field.field.name == "name":
3618
- save_ulabels_parent(values, field=field, key=key)
3378
+ for value in labels_saved["new"]:
3379
+ init_kwargs = {field.field.name: value}
3380
+ if registry == Feature:
3381
+ init_kwargs["dtype"] = "cat" if dtype is None else dtype
3382
+ non_validated_records.append(registry(**init_kwargs, **create_kwargs))
3383
+ ln_save(non_validated_records)
3619
3384
 
3620
- finally:
3621
- settings.verbosity = verbosity
3385
+ # save parent labels for ulabels, for example a parent label "project" for label "project001"
3386
+ if registry == ULabel and field.field.name == "name":
3387
+ save_ulabels_type(values, field=field, key=key)
3622
3388
 
3623
3389
  log_saved_labels(
3624
3390
  labels_saved,
@@ -3653,16 +3419,18 @@ def log_saved_labels(
3653
3419
  )
3654
3420
 
3655
3421
 
3656
- def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
3657
- """Save a parent label for the given labels."""
3422
+ def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
3423
+ """Save the ULabel type of the given labels."""
3658
3424
  registry = field.field.model
3659
3425
  assert registry == ULabel # noqa: S101
3660
- all_records = registry.from_values(list(values), field=field)
3661
- is_feature = registry.filter(name=f"{key}").one_or_none()
3662
- if is_feature is None:
3663
- is_feature = registry(name=f"{key}").save()
3664
- logger.important(f"Created a parent ULabel: {is_feature}")
3665
- is_feature.children.add(*all_records)
3426
+ all_records = registry.filter(**{field.field.name: list(values)}).all()
3427
+ # so `tissue_type` becomes `TissueType`
3428
+ type_name = "".join([i.capitalize() for i in key.lower().split("_")])
3429
+ ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
3430
+ if ulabel_type is None:
3431
+ ulabel_type = registry(name=type_name, is_type=True).save()
3432
+ logger.important(f"Created a ULabel type: {ulabel_type}")
3433
+ all_records.update(type=ulabel_type)
3666
3434
 
3667
3435
 
3668
3436
  def _save_organism(name: str):
@@ -3674,8 +3442,9 @@ def _save_organism(name: str):
3674
3442
  organism = bt.Organism.from_source(name=name)
3675
3443
  if organism is None:
3676
3444
  raise ValidationError(
3677
- f'Organism "{name}" not found\n'
3678
- f' → please save it: bt.Organism(name="{name}").save()'
3445
+ f'Organism "{name}" not found from public reference\n'
3446
+ f' → please save it from a different source: bt.Organism.from_source(name="{name}", source).save()'
3447
+ f' → or manually save it without source: bt.Organism(name="{name}").save()'
3679
3448
  )
3680
3449
  organism.save()
3681
3450
  return organism
@@ -3761,7 +3530,6 @@ def from_tiledbsoma(
3761
3530
  obs_columns: FieldAttr = Feature.name,
3762
3531
  organism: str | None = None,
3763
3532
  sources: dict[str, Record] | None = None,
3764
- exclude: dict[str, str | list[str]] | None = None,
3765
3533
  ) -> TiledbsomaCatManager:
3766
3534
  return TiledbsomaCatManager(
3767
3535
  experiment_uri=experiment_uri,
@@ -3770,7 +3538,6 @@ def from_tiledbsoma(
3770
3538
  obs_columns=obs_columns,
3771
3539
  organism=organism,
3772
3540
  sources=sources,
3773
- exclude=exclude,
3774
3541
  )
3775
3542
 
3776
3543
 
@@ -3782,7 +3549,6 @@ def from_spatialdata(
3782
3549
  categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3783
3550
  organism: str | None = None,
3784
3551
  sources: dict[str, dict[str, Record]] | None = None,
3785
- exclude: dict[str, dict] | None = None,
3786
3552
  verbosity: str = "hint",
3787
3553
  *,
3788
3554
  sample_metadata_key: str = "sample",
@@ -3799,7 +3565,6 @@ def from_spatialdata(
3799
3565
  verbosity=verbosity,
3800
3566
  organism=organism,
3801
3567
  sources=sources,
3802
- exclude=exclude,
3803
3568
  sample_metadata_key=sample_metadata_key,
3804
3569
  )
3805
3570