lamindb 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
@@ -42,6 +42,8 @@ from lamindb_setup.core import deprecated
  from lamindb_setup.core._docs import doc_args
  from lamindb_setup.core.upath import UPath

+ from lamindb.core._compat import is_package_installed
+
  if TYPE_CHECKING:
  from lamindb_setup.core.types import UPathStr
  from mudata import MuData
@@ -103,25 +105,26 @@ class CatLookup:

  def __init__(
  self,
- categoricals: dict[str, FieldAttr],
+ categoricals: list[Feature] | dict[str, FieldAttr],
  slots: dict[str, FieldAttr] = None,
  public: bool = False,
- organism: str | None = None,
  sources: dict[str, Record] | None = None,
  ) -> None:
  slots = slots or {}
+ if isinstance(categoricals, list):
+ categoricals = {
+ feature.name: parse_dtype(feature.dtype)[0]["field"]
+ for feature in categoricals
+ }
  self._categoricals = {**categoricals, **slots}
  self._public = public
- self._organism = organism
  self._sources = sources

  def __getattr__(self, name):
  if name in self._categoricals:
  registry = self._categoricals[name].field.model
  if self._public and hasattr(registry, "public"):
- return registry.public(
- organism=self._organism, source=self._sources.get(name)
- ).lookup()
+ return registry.public(source=self._sources.get(name)).lookup()
  else:
  return registry.lookup()
  raise AttributeError(
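
A minimal sketch of the new `CatLookup` signature (the feature names and direct use of this internal class are assumptions):

import lamindb as ln

features = [ln.Feature.get(name="cell_type_by_expert"), ln.Feature.get(name="perturbation")]
lookup = CatLookup(categoricals=features, public=True)  # fields are resolved from each feature's dtype
lookup.cell_type_by_expert  # public lookup; a per-key source comes from `sources`, organism is no longer passed
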
@@ -132,9 +135,7 @@ class CatLookup:
  if name in self._categoricals:
  registry = self._categoricals[name].field.model
  if self._public and hasattr(registry, "public"):
- return registry.public(
- organism=self._organism, source=self._sources.get(name)
- ).lookup()
+ return registry.public(source=self._sources.get(name)).lookup()
  else:
  return registry.lookup()
  raise AttributeError(
@@ -240,6 +241,7 @@ class Curator:
  pass # pragma: no cover


+ # default implementation for MuDataCurator and SpatialDataCurator
  class SlotsCurator(Curator):
  """Curator for a dataset with slots.

@@ -262,7 +264,7 @@ class SlotsCurator(Curator):
  # in form of {table/modality_key: var_field}
  self._var_fields: dict[str, FieldAttr] = {}
  # in form of {table/modality_key: categoricals}
- self._categoricals: dict[str, dict[str, FieldAttr]] = {}
+ self._cat_columns: dict[str, dict[str, CatColumn]] = {}

  @property
  @doc_args(SLOTS_DOCSTRING)
@@ -273,7 +275,8 @@ class SlotsCurator(Curator):
  @doc_args(VALIDATE_DOCSTRING)
  def validate(self) -> None:
  """{}""" # noqa: D415
- for _, curator in self._slots.items():
+ for slot, curator in self._slots.items():
+ logger.info(f"validating slot {slot} ...")
  curator.validate()

  @doc_args(SAVE_ARTIFACT_DOCSTRING)
@@ -288,18 +291,34 @@ class SlotsCurator(Curator):
  """{}""" # noqa: D415
  if not self._is_validated:
  self.validate()
-
- # default implementation for MuDataCurator and SpatialDataCurator
- return save_artifact( # type: ignore
- self._dataset,
- key=key,
- description=description,
- fields=self._categoricals,
+ if self._artifact is None:
+ if data_is_mudata(self._dataset):
+ self._artifact = Artifact.from_mudata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ elif data_is_spatialdata(self._dataset):
+ self._artifact = Artifact.from_spatialdata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ self._artifact.schema = self._schema
+ self._artifact.save()
+ cat_columns = {}
+ for curator in self._slots.values():
+ for key, cat_column in curator._cat_manager._cat_columns.items():
+ cat_columns[key] = cat_column
+ return annotate_artifact( # type: ignore
+ self._artifact,
  index_field=self._var_fields,
- artifact=self._artifact,
- revises=revises,
- run=run,
  schema=self._schema,
+ cat_columns=cat_columns,
  )


@@ -373,11 +392,34 @@ class DataFrameCurator(Curator):
  schema: Schema,
  ) -> None:
  super().__init__(dataset=dataset, schema=schema)
- categoricals = {}
+ categoricals = []
+ features = []
+ feature_ids: set[int] = set()
+ if schema.flexible and isinstance(self._dataset, pd.DataFrame):
+ features += Feature.filter(name__in=self._dataset.keys()).list()
+ feature_ids = {feature.id for feature in features}
  if schema.n > 0:
+ schema_features = schema.features.all().list()
+ if feature_ids:
+ features.extend(
+ feature
+ for feature in schema_features
+ if feature.id not in feature_ids
+ )
+ else:
+ features.extend(schema_features)
+ else:
+ assert schema.itype is not None # noqa: S101
+ if features:
  # populate features
  pandera_columns = {}
- for feature in schema.features.all():
+ if schema.minimal_set:
+ optional_feature_uids = set(schema.optionals.get_uids())
+ for feature in features:
+ if schema.minimal_set:
+ required = feature.uid not in optional_feature_uids
+ else:
+ required = False
  if feature.dtype in {"int", "float", "num"}:
  dtype = (
  self._dataset[feature.name].dtype
@@ -393,6 +435,7 @@ class DataFrameCurator(Curator):
  ),
  nullable=feature.nullable,
  coerce=feature.coerce_dtype,
+ required=required,
  )
  else:
  pandera_dtype = (
@@ -404,14 +447,18 @@ class DataFrameCurator(Curator):
  pandera_dtype,
  nullable=feature.nullable,
  coerce=feature.coerce_dtype,
+ required=required,
  )
  if feature.dtype.startswith("cat"):
- categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
+ # validate categoricals if the column is required or if the column is present
+ if required or feature.name in self._dataset.columns:
+ categoricals.append(feature)
  self._pandera_schema = pandera.DataFrameSchema(
- pandera_columns, coerce=schema.coerce_dtype
+ pandera_columns,
+ coerce=schema.coerce_dtype,
+ strict=schema.maximal_set,
+ ordered=schema.ordered_set,
  )
- else:
- assert schema.itype is not None # noqa: S101
  self._cat_manager = DataFrameCatManager(
  self._dataset,
  columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
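
A minimal sketch of how the schema flags referenced above drive the curator (the `flexible` constructor argument and the feature name are assumptions; `minimal_set`, `maximal_set`, and `ordered_set` map onto pandera's `required`, `strict`, and `ordered`):

import bionty as bt
import lamindb as ln
import pandas as pd

schema = ln.Schema(
    name="flexible sample sheet",
    features=[ln.Feature(name="cell_type", dtype=bt.CellType).save()],
    flexible=True,  # also validate df columns that match already-registered Features
).save()

df = pd.DataFrame({"cell_type": ["T cell"], "batch": ["batch1"]})
curator = ln.curators.DataFrameCurator(df, schema)
curator.validate()  # optional features become required=False columns; extra columns only fail if maximal_set
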
@@ -504,16 +551,21 @@ class DataFrameCurator(Curator):
  if not self._is_validated:
  self.validate() # raises ValidationError if doesn't validate
  result = parse_cat_dtype(self._schema.itype, is_itype=True)
- return save_artifact( # type: ignore
- self._dataset,
- description=description,
- fields=self._cat_manager.categoricals,
+ if self._artifact is None:
+ self._artifact = Artifact.from_df(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ self._artifact.schema = self._schema
+ self._artifact.save()
+ return annotate_artifact( # type: ignore
+ self._artifact,
  index_field=result["field"],
- key=key,
- artifact=self._artifact,
- revises=revises,
- run=run,
  schema=self._schema,
+ cat_columns=self._cat_manager._cat_columns,
  )
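
A minimal sketch of the resulting `save_artifact` flow, continuing the sketch after the previous hunk (`df` and `schema` as defined there; the key is a placeholder):

import lamindb as ln

curator = ln.curators.DataFrameCurator(df, schema)
try:
    curator.validate()
except ln.errors.ValidationError:
    curator.cat.add_new_from("cell_type")  # register the new labels, then validate again
    curator.validate()
artifact = curator.save_artifact(key="examples/sample_sheet.parquet")
assert artifact.schema == schema  # the schema is attached before the artifact is saved
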


@@ -548,7 +600,7 @@ class AnnDataCurator(SlotsCurator):
  ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
  ln.Feature(name="sample_note", dtype=str).save(),
  ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
- ln.Feature(name="cell_type_by_model", dtype=bt.CellType").save(),
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
  ],
  ).save()

@@ -596,6 +648,12 @@ class AnnDataCurator(SlotsCurator):
  for slot, slot_schema in schema.slots.items()
  if slot in {"obs", "var", "uns"}
  }
+ # TODO: better way to handle this!
+ if "var" in self._slots:
+ self._slots["var"]._cat_manager._cat_columns["var_index"] = self._slots[
+ "var"
+ ]._cat_manager._cat_columns.pop("columns")
+ self._slots["var"]._cat_manager._cat_columns["var_index"]._key = "var_index"

  @doc_args(SAVE_ARTIFACT_DOCSTRING)
  def save_artifact(
@@ -609,23 +667,28 @@ class AnnDataCurator(SlotsCurator):
  """{}""" # noqa: D415
  if not self._is_validated:
  self.validate()
- if "obs" in self.slots:
- categoricals = self.slots["obs"]._cat_manager.categoricals
- else:
- categoricals = {}
- return save_artifact( # type: ignore
- self._dataset,
- description=description,
- fields=categoricals,
+ if self._artifact is None:
+ self._artifact = Artifact.from_anndata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ self._artifact.schema = self._schema
+ self._artifact.save()
+ return annotate_artifact( # type: ignore
+ self._artifact,
+ cat_columns=(
+ self.slots["obs"]._cat_manager._cat_columns
+ if "obs" in self.slots
+ else {}
+ ),
  index_field=(
  parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
  if "var" in self._slots
  else None
  ),
- key=key,
- artifact=self._artifact,
- revises=revises,
- run=run,
  schema=self._schema,
  )
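
A minimal sketch of an `AnnDataCurator` round trip under the new flow; the composite-schema construction via `otype`/`slots` is an assumed pattern based on the docstring above, and `adata` is a placeholder:

import bionty as bt
import lamindb as ln

obs_schema = ln.Schema(
    name="obs schema",
    features=[ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save()],
).save()
var_schema = ln.Schema(name="var schema", itype=bt.Gene.ensembl_gene_id, dtype=int).save()
anndata_schema = ln.Schema(
    name="anndata schema", otype="AnnData", slots={"obs": obs_schema, "var": var_schema}
).save()

adata = ...  # an AnnData whose obs/var match the schema (placeholder)
curator = ln.curators.AnnDataCurator(adata, anndata_schema)
curator.validate()  # logs "validating slot obs ...", "validating slot var ..."
artifact = curator.save_artifact(key="examples/dataset.h5ad")  # Artifact.from_anndata + annotate_artifact
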

@@ -636,14 +699,14 @@ def _assign_var_fields_categoricals_multimodal(
  slot: str,
  slot_schema: Schema,
  var_fields: dict[str, FieldAttr],
- categoricals: dict[str, dict[str, FieldAttr]],
+ cat_columns: dict[str, dict[str, CatColumn]],
  slots: dict[str, DataFrameCurator],
  ) -> None:
  """Assigns var_fields and categoricals for multimodal data curators."""
  if modality is not None:
  # Makes sure that all tables are present
  var_fields[modality] = None
- categoricals[modality] = {}
+ cat_columns[modality] = {}

  if slot_type == "var":
  var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
@@ -654,12 +717,12 @@ def _assign_var_fields_categoricals_multimodal(
  # Note that this is NOT nested since the nested key is always "var"
  var_fields[modality] = var_field
  else:
- obs_fields = slots[slot]._cat_manager.categoricals
+ obs_fields = slots[slot]._cat_manager._cat_columns
  if modality is None:
- categoricals[slot] = obs_fields
+ cat_columns[slot] = obs_fields
  else:
  # Note that this is NOT nested since the nested key is always "obs"
- categoricals[modality] = obs_fields
+ cat_columns[modality] = obs_fields


  class MuDataCurator(SlotsCurator):
@@ -770,7 +833,7 @@ class MuDataCurator(SlotsCurator):
  slot=slot,
  slot_schema=slot_schema,
  var_fields=self._var_fields,
- categoricals=self._categoricals,
+ cat_columns=self._cat_columns,
  slots=self._slots,
  )

@@ -890,7 +953,7 @@ class SpatialDataCurator(SlotsCurator):
  slot=slot,
  slot_schema=slot_schema,
  var_fields=self._var_fields,
- categoricals=self._categoricals,
+ cat_columns=self._cat_columns,
  slots=self._slots,
  )

@@ -898,6 +961,296 @@ class SpatialDataCurator(SlotsCurator):
  self._columns_field = self._var_fields


+ class CatColumn:
+ """Categorical column for `DataFrame`.
+
+ Args:
+ values_getter: A callable or iterable that returns the values to validate.
+ field: The field to validate against.
+ key: The name of the column to validate. Only used for logging.
+ values_setter: A callable that sets the values.
+ source: The source to validate against.
+ """
+
+ def __init__(
+ self,
+ values_getter: Callable | Iterable[str],
+ field: FieldAttr,
+ key: str,
+ values_setter: Callable | None = None,
+ source: Record | None = None,
+ feature: Feature | None = None,
+ ) -> None:
+ self._values_getter = values_getter
+ self._values_setter = values_setter
+ self._field = field
+ self._key = key
+ self._source = source
+ self._organism = None
+ self._validated: None | list[str] = None
+ self._non_validated: None | list[str] = None
+ self._synonyms: None | dict[str, str] = None
+ self.feature = feature
+ self.labels = None
+ if hasattr(field.field.model, "_name_field"):
+ label_ref_is_name = field.field.name == field.field.model._name_field
+ else:
+ label_ref_is_name = field.field.name == "name"
+ self.label_ref_is_name = label_ref_is_name
+
+ @property
+ def values(self):
+ """Get the current values using the getter function."""
+ if callable(self._values_getter):
+ return self._values_getter()
+ return self._values_getter
+
+ @values.setter
+ def values(self, new_values):
+ """Set new values using the setter function if available."""
+ if callable(self._values_setter):
+ self._values_setter(new_values)
+ else:
+ # If values_getter is not callable, it's a direct reference we can update
+ self._values_getter = new_values
+
+ @property
+ def is_validated(self) -> bool:
+ """Return whether the column is validated."""
+ return len(self._non_validated) == 0
+
+ def _replace_synonyms(self) -> list[str]:
+ """Replace synonyms in the column with standardized values."""
+ syn_mapper = self._synonyms
+ # replace the values in df
+ std_values = self.values.map(
+ lambda unstd_val: syn_mapper.get(unstd_val, unstd_val)
+ )
+ # remove the standardized values from self.non_validated
+ non_validated = [i for i in self._non_validated if i not in syn_mapper]
+ if len(non_validated) == 0:
+ self._non_validated = []
+ else:
+ self._non_validated = non_validated # type: ignore
+ # logging
+ n = len(syn_mapper)
+ if n > 0:
+ syn_mapper_print = _format_values(
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
+ )
+ s = "s" if n > 1 else ""
+ logger.success(
+ f'standardized {n} synonym{s} in "{self._key}": {colors.green(syn_mapper_print)}'
+ )
+ return std_values
+
+ def _add_validated(self) -> tuple[list, list]:
+ """Save features or labels records in the default instance."""
+ from lamindb.models.save import save as ln_save
+
+ registry = self._field.field.model
+ field_name = self._field.field.name
+ model_field = registry.__get_name_with_module__()
+ filter_kwargs = get_current_filter_kwargs(
+ registry, {"organism": self._organism, "source": self._source}
+ )
+ values = [i for i in self.values if isinstance(i, str) and i]
+ if not values:
+ return [], []
+
+ # inspect the default instance and save validated records from public
+ existing_and_public_records = registry.from_values(
+ list(values), field=self._field, **filter_kwargs, mute=True
+ )
+ existing_and_public_labels = [
+ getattr(r, field_name) for r in existing_and_public_records
+ ]
+ # public records that are not already in the database
+ public_records = [r for r in existing_and_public_records if r._state.adding]
+ # here we check to only save the public records if they are from the specified source
+ # we check the uid because r.source and source can be from different instances
+ if self._source:
+ public_records = [
+ r for r in public_records if r.source.uid == self._source.uid
+ ]
+ if len(public_records) > 0:
+ logger.info(f"saving validated records of '{self._key}'")
+ ln_save(public_records)
+ labels_saved_public = [getattr(r, field_name) for r in public_records]
+ # log the saved public labels
+ # the term "transferred" stresses that this is always in the context of transferring
+ # labels from a public ontology or a different instance to the present instance
+ if len(labels_saved_public) > 0:
+ s = "s" if len(labels_saved_public) > 1 else ""
+ logger.success(
+ f'added {len(labels_saved_public)} record{s} {colors.green("from_public")} with {model_field} for "{self._key}": {_format_values(labels_saved_public)}'
+ )
+ self.labels = existing_and_public_records
+
+ # non-validated records from the default instance
+ non_validated_labels = [
+ i for i in values if i not in existing_and_public_labels
+ ]
+
+ # validated, non-validated
+ return existing_and_public_labels, non_validated_labels
+
+ def _add_new(
+ self,
+ values: list[str],
+ df: pd.DataFrame | None = None, # remove when all users use schema
+ dtype: str | None = None,
+ **create_kwargs,
+ ) -> None:
+ """Add new labels to the registry."""
+ from lamindb.models.save import save as ln_save
+
+ registry = self._field.field.model
+ field_name = self._field.field.name
+ non_validated_records: RecordList[Any] = [] # type: ignore
+ if df is not None and registry == Feature:
+ nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
+ non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
+ else:
+ if (
+ self._organism
+ and hasattr(registry, "organism")
+ and registry._meta.get_field("organism").is_relation
+ ):
+ # make sure organism record is saved to the current instance
+ create_kwargs["organism"] = _save_organism(name=self._organism)
+
+ for value in values:
+ init_kwargs = {field_name: value}
+ if registry == Feature:
+ init_kwargs["dtype"] = "cat" if dtype is None else dtype
+ non_validated_records.append(registry(**init_kwargs, **create_kwargs))
+ if len(non_validated_records) > 0:
+ ln_save(non_validated_records)
+ model_field = colors.italic(registry.__get_name_with_module__())
+ s = "s" if len(values) > 1 else ""
+ logger.success(
+ f'added {len(values)} record{s} with {model_field} for "{self._key}": {_format_values(values)}'
+ )
+
+ def _validate(
+ self,
+ values: list[str],
+ curator: CatManager | None = None, # TODO: not yet used
+ ) -> tuple[list[str], dict]:
+ """Validate ontology terms using LaminDB registries."""
+ registry = self._field.field.model
+ field_name = self._field.field.name
+ model_field = f"{registry.__name__}.{field_name}"
+
+ def _log_mapping_info():
+ logger.indent = ""
+ logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
+ logger.indent = " "
+
+ kwargs_current = get_current_filter_kwargs(
+ registry, {"organism": self._organism, "source": self._source}
+ )
+
+ # inspect values from the default instance, excluding public
+ inspect_result = registry.inspect(
+ values, field=self._field, mute=True, from_source=False, **kwargs_current
+ )
+ non_validated = inspect_result.non_validated
+ syn_mapper = inspect_result.synonyms_mapper
+
+ # inspect the non-validated values from public (BioRecord only)
+ values_validated = []
+ if hasattr(registry, "public"):
+ public_records = registry.from_values(
+ non_validated,
+ field=self._field,
+ mute=True,
+ **kwargs_current,
+ )
+ values_validated += [getattr(r, field_name) for r in public_records]
+
+ # logging messages
+ non_validated_hint_print = f'.add_new_from("{self._key}")'
+ non_validated = [i for i in non_validated if i not in values_validated]
+ n_non_validated = len(non_validated)
+ if n_non_validated == 0:
+ logger.indent = ""
+ logger.success(
+ f'"{self._key}" is validated against {colors.italic(model_field)}'
+ )
+ return [], {}
+ else:
+ are = "is" if n_non_validated == 1 else "are"
+ s = "" if n_non_validated == 1 else "s"
+ print_values = _format_values(non_validated)
+ warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
+ if syn_mapper:
+ s = "" if len(syn_mapper) == 1 else "s"
+ syn_mapper_print = _format_values(
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
+ )
+ hint_msg = f'.standardize("{self._key}")'
+ warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
+ if n_non_validated > len(syn_mapper):
+ if syn_mapper:
+ warning_message += "\n for remaining terms:\n"
+ warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
+
+ if logger.indent == "":
+ _log_mapping_info()
+ logger.warning(warning_message)
+ if curator is not None:
+ curator._validate_category_error_messages = strip_ansi_codes(
+ warning_message
+ )
+ logger.indent = ""
+ return non_validated, syn_mapper
+
+ def validate(self) -> None:
+ """Validate the column."""
+ # add source-validated values to the registry
+ self._validated, self._non_validated = self._add_validated()
+ self._non_validated, self._synonyms = self._validate(values=self._non_validated)
+ # always register new Features if they are columns
+ if self._key == "columns" and self._field == Feature.name:
+ self.add_new()
+
+ def standardize(self) -> None:
+ """Standardize the column."""
+ registry = self._field.field.model
+ if not hasattr(registry, "standardize"):
+ return self.values
+ if self._synonyms is None:
+ self.validate()
+ # get standardized values
+ std_values = self._replace_synonyms()
+ # update non_validated values
+ self._non_validated = [
+ i for i in self._non_validated if i not in self._synonyms.keys()
+ ]
+ # remove synonyms since they are now standardized
+ self._synonyms = {}
+ # update the values with the standardized values
+ self.values = std_values
+
+ def add_new(self, **create_kwargs) -> None:
+ """Add new values to the registry."""
+ if self._non_validated is None:
+ self.validate()
+ if len(self._synonyms) > 0:
+ # raise error because .standardize modifies the input dataset
+ raise ValidationError(
+ "Please run `.standardize()` before adding new values."
+ )
+ self._add_new(
+ values=self._non_validated,
+ **create_kwargs,
+ )
+ # remove the non_validated values since they are now registered
+ self._non_validated = []
+
+
  class CatManager:
  """Manage categoricals by updating registries.

@@ -909,14 +1262,16 @@ class CatManager:
  - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
  """

- def __init__(self, *, dataset, categoricals, sources, organism, columns_field=None):
+ def __init__(self, *, dataset, categoricals, sources, columns_field=None):
  # the below is shared with Curator
  self._artifact: Artifact = None # pass the dataset as an artifact
  self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
  if isinstance(self._dataset, Artifact):
  self._artifact = self._dataset
  if self._artifact.otype in {"DataFrame", "AnnData"}:
- self._dataset = self._dataset.load()
+ self._dataset = self._dataset.load(
+ is_run_input=False # we already track this in the Curator constructor
+ )
  self._is_validated: bool = False
  # shared until here
  self._categoricals = categoricals or {}
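
A minimal sketch of driving the `CatColumn` class introduced above directly; it is an internal helper used by the managers below, so standalone use like this is an assumption:

import bionty as bt
import pandas as pd

df = pd.DataFrame({"cell_type": ["T cell", "T-cell"]})  # the second value may only be a synonym
col = CatColumn(
    values_getter=lambda: df["cell_type"],
    values_setter=lambda new_values: df.__setitem__("cell_type", new_values),
    field=bt.CellType.name,
    key="cell_type",
)
col.validate()          # fills _validated, _non_validated, and _synonyms
if not col.is_validated:
    col.standardize()   # rewrites any known synonyms in place via the setter
    col.add_new()       # registers whatever is still unvalidated
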
@@ -924,49 +1279,24 @@
  self._sources = sources or {}
  self._columns_field = columns_field
  self._validate_category_error_messages: str = ""
- # make sure to only fetch organism once at the beginning
- if organism:
- self._organism = organism
- else:
- fields = list(self._categoricals.values()) + [columns_field]
- organisms = {get_organism_kwargs(field).get("organism") for field in fields}
- self._organism = organisms.pop() if len(organisms) > 0 else None
+ self._cat_columns: dict[str, CatColumn] = {}

  @property
  def non_validated(self) -> dict[str, list[str]]:
  """Return the non-validated features and labels."""
  if self._non_validated is None:
  raise ValidationError("Please run validate() first!")
- return self._non_validated
+ return {
+ key: cat_column._non_validated
+ for key, cat_column in self._cat_columns.items()
+ if cat_column._non_validated and key != "columns"
+ }

  @property
  def categoricals(self) -> dict:
  """Return the columns fields to validate against."""
  return self._categoricals

- def _replace_synonyms(
- self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
- ):
- # replace the values in df
- std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
- # remove the standardized values from self.non_validated
- non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
- if len(non_validated) == 0:
- self._non_validated.pop(key, None) # type: ignore
- else:
- self._non_validated[key] = non_validated # type: ignore
- # logging
- n = len(syn_mapper)
- if n > 0:
- syn_mapper_print = _format_values(
- [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
- )
- s = "s" if n > 1 else ""
- logger.success(
- f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
- )
- return std_values
-
  def validate(self) -> bool:
  """Validate dataset.

@@ -1006,19 +1336,49 @@ class CatManager:
  if not self._is_validated: # need to raise error manually
  raise ValidationError("Dataset does not validate. Please curate.")

- self._artifact = save_artifact( # type: ignore
- self._dataset,
- key=key,
- description=description,
- fields=self.categoricals,
+ if self._artifact is None:
+ if isinstance(self._dataset, pd.DataFrame):
+ artifact = Artifact.from_df(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ elif isinstance(self._dataset, AnnData):
+ artifact = Artifact.from_anndata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ elif data_is_mudata(self._dataset):
+ artifact = Artifact.from_mudata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ elif data_is_spatialdata(self._dataset):
+ artifact = Artifact.from_spatialdata(
+ self._dataset,
+ key=key,
+ description=description,
+ revises=revises,
+ run=run,
+ )
+ else:
+ raise InvalidArgument( # pragma: no cover
+ "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
+ )
+ self._artifact = artifact.save()
+ annotate_artifact( # type: ignore
+ self._artifact,
  index_field=self._columns_field,
- artifact=self._artifact,
- revises=revises,
- run=run,
- schema=None,
- organism=self._organism,
+ cat_columns=self._cat_columns,
  )
-
  return self._artifact


@@ -1029,24 +1389,73 @@ class DataFrameCatManager(CatManager):
  self,
  df: pd.DataFrame | Artifact,
  columns: FieldAttr = Feature.name,
- categoricals: dict[str, FieldAttr] | None = None,
- verbosity: str = "hint",
- organism: str | None = None,
+ categoricals: list[Feature] | dict[str, FieldAttr] | None = None,
  sources: dict[str, Record] | None = None,
  ) -> None:
- if organism is not None and not isinstance(organism, str):
- raise ValueError("organism must be a string such as 'human' or 'mouse'!")
-
- settings.verbosity = verbosity
  self._non_validated = None
  super().__init__(
  dataset=df,
  columns_field=columns,
- organism=organism,
  categoricals=categoricals,
  sources=sources,
  )
- self._save_columns()
+ if columns == Feature.name:
+ if isinstance(self._categoricals, list):
+ values = [feature.name for feature in self._categoricals]
+ else:
+ values = list(self._categoricals.keys())
+ self._cat_columns["columns"] = CatColumn(
+ values_getter=values,
+ field=self._columns_field,
+ key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
+ source=self._sources.get("columns"),
+ )
+ self._cat_columns["columns"].validate()
+ else:
+ # NOTE: for var_index right now
+ self._cat_columns["columns"] = CatColumn(
+ values_getter=lambda: self._dataset.columns, # lambda ensures the inplace update
+ values_setter=lambda new_values: setattr(
+ self._dataset, "columns", pd.Index(new_values)
+ ),
+ field=self._columns_field,
+ key="columns",
+ source=self._sources.get("columns"),
+ )
+ if isinstance(self._categoricals, list):
+ for feature in self._categoricals:
+ result = parse_dtype(feature.dtype)[
+ 0
+ ] # TODO: support composite dtypes for categoricals
+ key = feature.name
+ field = result["field"]
+ self._cat_columns[key] = CatColumn(
+ values_getter=lambda k=key: self._dataset[
+ k
+ ], # Capture key as default argument
+ values_setter=lambda new_values, k=key: self._dataset.__setitem__(
+ k, new_values
+ ),
+ field=field,
+ key=key,
+ source=self._sources.get(key),
+ feature=feature,
+ )
+ else:
+ # below is for backward compat of ln.Curator.from_df()
+ for key, field in self._categoricals.items():
+ self._cat_columns[key] = CatColumn(
+ values_getter=lambda k=key: self._dataset[
+ k
+ ], # Capture key as default argument
+ values_setter=lambda new_values, k=key: self._dataset.__setitem__(
+ k, new_values
+ ),
+ field=field,
+ key=key,
+ source=self._sources.get(key),
+ feature=Feature.get(name=key),
+ )

  def lookup(self, public: bool = False) -> CatLookup:
  """Lookup categories.
@@ -1058,59 +1467,20 @@ class DataFrameCatManager(CatManager):
  categoricals=self._categoricals,
  slots={"columns": self._columns_field},
  public=public,
- organism=self._organism,
  sources=self._sources,
  )

- def _save_columns(self, validated_only: bool = True) -> None:
- """Save column name records."""
- # Always save features specified as the fields keys
- update_registry(
- values=list(self.categoricals.keys()),
- field=self._columns_field,
- key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
- validated_only=False,
- source=self._sources.get("columns"),
- )
-
- # Save the rest of the columns based on validated_only
- additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
- if additional_columns:
- update_registry(
- values=list(additional_columns),
- field=self._columns_field,
- key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
- validated_only=validated_only,
- df=self._dataset, # Get the Feature type from df
- source=self._sources.get("columns"),
- )
-
- @deprecated(new_name="is run by default")
- def add_new_from_columns(self, organism: str | None = None, **kwargs):
- pass # pragma: no cover
-
  def validate(self) -> bool:
- """Validate variables and categorical observations.
-
- This method also registers the validated records in the current instance:
- - from public sources
+ """Validate variables and categorical observations."""
+ self._validate_category_error_messages = "" # reset the error messages

- Args:
- organism: The organism name.
+ validated = True
+ for _, cat_column in self._cat_columns.items():
+ cat_column.validate()
+ validated &= cat_column.is_validated
+ self._is_validated = validated
+ self._non_validated = {} # so it's no longer None

- Returns:
- Whether the DataFrame is validated.
- """
- # add all validated records to the current instance
- self._update_registry_all()
- self._validate_category_error_messages = "" # reset the error messages
- self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
- self._dataset,
- fields=self.categoricals,
- sources=self._sources,
- curator=self,
- organism=self._organism,
- )
  return self._is_validated

  def standardize(self, key: str) -> None:
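
A minimal sketch of the two `categoricals` forms accepted by the rewritten manager and its per-column validation (internal classes; the DataFrame and feature names are placeholders):

import bionty as bt
import lamindb as ln
import pandas as pd

df = pd.DataFrame({"cell_type": ["T cell", "T-cell"], "perturbation": ["DMSO", "IFNG"]})
features = [
    ln.Feature(name="cell_type", dtype=bt.CellType).save(),
    ln.Feature(name="perturbation", dtype="cat[ULabel]").save(),
]
cat_manager = DataFrameCatManager(df, categoricals=features)  # new in 1.4.0: list[Feature]
# dict form kept for backward compatibility with ln.Curator.from_df():
# cat_manager = DataFrameCatManager(df, categoricals={"cell_type": bt.CellType.name})
cat_manager.validate()      # loops over the per-column CatColumn objects
cat_manager.non_validated   # e.g. {"cell_type": ["T-cell"], "perturbation": ["IFNG"]}
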
@@ -1123,82 +1493,37 @@ class DataFrameCatManager(CatManager):
  """
  if self._artifact is not None:
  raise RuntimeError("can't mutate the dataset when an artifact is passed!")
- # list is needed to avoid RuntimeError: dictionary changed size during iteration
- avail_keys = list(self.non_validated.keys())
- if len(avail_keys) == 0:
- logger.warning("values are already standardized")
- return

  if key == "all":
- for k in avail_keys:
- if k in self._categoricals: # needed to exclude var_index
- syn_mapper = standardize_categories(
- self.non_validated[k],
- field=self._categoricals[k],
- source=self._sources.get(k),
- )
- self._dataset[k] = self._replace_synonyms(
- k, syn_mapper, self._dataset[k]
- )
- else:
- if key not in avail_keys:
- if key in self._categoricals:
- logger.warning(f"No non-standardized values found for {key!r}")
- else:
- raise KeyError(
- f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
- )
- else:
- if key in self._categoricals: # needed to exclude var_index
- syn_mapper = standardize_categories(
- self.non_validated[key],
- field=self._categoricals[key],
- source=self._sources.get(key),
- organism=self._organism,
- )
- self._dataset[key] = self._replace_synonyms(
- key, syn_mapper, self._dataset[key]
- )
-
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
- """Save labels for all features."""
- for name in self.categoricals.keys():
- self._update_registry(name, validated_only=validated_only, **kwargs)
-
- def _update_registry(
- self, categorical: str, validated_only: bool = True, **kwargs
- ) -> None:
- if categorical == "all":
- self._update_registry_all(validated_only=validated_only, **kwargs)
- else:
- if categorical not in self.categoricals:
- raise ValidationError(
- f"Feature {categorical} is not part of the fields!"
- )
- update_registry(
- values=_flatten_unique(self._dataset[categorical]),
- field=self.categoricals[categorical],
- key=categorical,
- validated_only=validated_only,
- source=self._sources.get(categorical),
- organism=self._organism,
+ logger.warning(
+ "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
  )
- # adding new records removes them from non_validated
- if not validated_only and self._non_validated:
- self._non_validated.pop(categorical, None) # type: ignore
+ for k in self.non_validated.keys():
+ self._cat_columns[k].standardize()
+ else:
+ self._cat_columns[key].standardize()

  def add_new_from(self, key: str, **kwargs):
  """Add validated & new categories.

  Args:
  key: The key referencing the slot in the DataFrame from which to draw terms.
- organism: The organism name.
  **kwargs: Additional keyword arguments to pass to create new records
  """
  if len(kwargs) > 0 and key == "all":
  raise ValueError("Cannot pass additional arguments to 'all' key!")
- self._update_registry(key, validated_only=False, **kwargs)
+ if key == "all":
+ logger.warning(
+ "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
+ )
+ for k in self.non_validated.keys():
+ self._cat_columns[k].add_new(**kwargs)
+ else:
+ self._cat_columns[key].add_new(**kwargs)

+ @deprecated(
+ new_name="Run.filter(transform=context.run.transform, output_artifacts=None)"
+ )
  def clean_up_failed_runs(self):
  """Clean up previous failed runs that don't save any outputs."""
  from lamindb.core._context import context
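
A minimal sketch of migrating off the deprecated "all" key (`cat_manager` as in the sketch above):

# 1.3.x style, now deprecated: logs a warning and loops over all keys for you
cat_manager.standardize("all")

# 1.4.0 style: address each non-validated column explicitly
for key in list(cat_manager.non_validated.keys()):
    cat_manager.standardize(key)   # replace known synonyms in the dataset
    cat_manager.add_new_from(key)  # then register the remaining new values
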
@@ -1218,8 +1543,6 @@ class AnnDataCatManager(CatManager):
  var_index: FieldAttr | None = None,
  categoricals: dict[str, FieldAttr] | None = None,
  obs_columns: FieldAttr = Feature.name,
- verbosity: str = "hint",
- organism: str | None = None,
  sources: dict[str, Record] | None = None,
  ) -> None:
  if isinstance(var_index, str):
@@ -1242,7 +1565,6 @@ class AnnDataCatManager(CatManager):
  dataset=data,
  categoricals=categoricals,
  sources=self._sources,
- organism=organism,
  columns_field=var_index,
  )
  self._adata = self._dataset
@@ -1250,10 +1572,19 @@ class AnnDataCatManager(CatManager):
  df=self._adata.obs,
  categoricals=self.categoricals,
  columns=obs_columns,
- verbosity=verbosity,
- organism=None,
  sources=self._sources,
  )
+ self._cat_columns = self._obs_df_curator._cat_columns.copy()
+ if var_index is not None:
+ self._cat_columns["var_index"] = CatColumn(
+ values_getter=lambda: self._adata.var.index,
+ values_setter=lambda new_values: setattr(
+ self._adata.var, "index", pd.Index(new_values)
+ ),
+ field=self._var_field,
+ key="var_index",
+ source=self._sources.get("var_index"),
+ )

  @property
  def var_index(self) -> FieldAttr:
@@ -1275,76 +1606,51 @@ class AnnDataCatManager(CatManager):
  categoricals=self._obs_fields,
  slots={"columns": self._columns_field, "var_index": self._var_field},
  public=public,
- organism=self._organism,
  sources=self._sources,
  )

- def _save_from_var_index(
- self,
- validated_only: bool = True,
- ):
- """Save variable records."""
- if self.var_index is not None:
- update_registry(
- values=list(self._adata.var.index),
- field=self.var_index,
- key="var_index",
- validated_only=validated_only,
- organism=self._organism,
- source=self._sources.get("var_index"),
- )
-
  def add_new_from(self, key: str, **kwargs):
  """Add validated & new categories.

  Args:
  key: The key referencing the slot in the DataFrame from which to draw terms.
- organism: The organism name.
  **kwargs: Additional keyword arguments to pass to create new records
  """
- self._obs_df_curator.add_new_from(key, **kwargs)
+ if key == "all":
+ logger.warning(
+ "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
+ )
+ for k in self.non_validated.keys():
+ self._cat_columns[k].add_new(**kwargs)
+ else:
+ self._cat_columns[key].add_new(**kwargs)

+ @deprecated(new_name="add_new_from('var_index')")
  def add_new_from_var_index(self, **kwargs):
  """Update variable records.

  Args:
- organism: The organism name.
  **kwargs: Additional keyword arguments to pass to create new records.
  """
- self._save_from_var_index(validated_only=False, **kwargs)
+ self.add_new_from(key="var_index", **kwargs)

  def validate(self) -> bool:
  """Validate categories.

  This method also registers the validated records in the current instance.

- Args:
- organism: The organism name.
-
  Returns:
  Whether the AnnData object is validated.
  """
  self._validate_category_error_messages = "" # reset the error messages

- # add all validated records to the current instance
- self._save_from_var_index(validated_only=True)
- if self.var_index is not None:
- validated_var, non_validated_var = validate_categories(
- self._adata.var.index,
- field=self._var_field,
- key="var_index",
- source=self._sources.get("var_index"),
- hint_print=".add_new_from_var_index()",
- organism=self._organism, # type: ignore
- )
- else:
- validated_var = True
- non_validated_var = []
- validated_obs = self._obs_df_curator.validate()
- self._non_validated = self._obs_df_curator._non_validated # type: ignore
- if len(non_validated_var) > 0:
- self._non_validated["var_index"] = non_validated_var # type: ignore
- self._is_validated = validated_var and validated_obs
+ validated = True
+ for _, cat_column in self._cat_columns.items():
+ cat_column.validate()
+ validated &= cat_column.is_validated
+
+ self._non_validated = {} # so it's no longer None
+ self._is_validated = validated
  return self._is_validated

  def standardize(self, key: str):
@@ -1360,23 +1666,17 @@ class AnnDataCatManager(CatManager):
  """
  if self._artifact is not None:
  raise RuntimeError("can't mutate the dataset when an artifact is passed!")
- if key in self._adata.obs.columns or key == "all":
- # standardize obs columns
- self._obs_df_curator.standardize(key)
- # in addition to the obs columns, standardize the var.index
- if key == "var_index" or key == "all":
- syn_mapper = standardize_categories(
- self._adata.var.index,
- field=self.var_index,
- source=self._sources.get("var_index"),
- organism=self._organism,
+ if key == "all":
+ logger.warning(
+ "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
  )
- if "var_index" in self._non_validated: # type: ignore
- self._adata.var.index = self._replace_synonyms(
- "var_index", syn_mapper, self._adata.var.index
- )
+ for k in self.non_validated.keys():
+ self._cat_columns[k].standardize()
+ else:
+ self._cat_columns[key].standardize()


+ @deprecated(new_name="MuDataCurator")
  class MuDataCatManager(CatManager):
  """Categorical manager for `MuData`."""

@@ -1385,15 +1685,12 @@ class MuDataCatManager(CatManager):
  mdata: MuData | Artifact,
  var_index: dict[str, FieldAttr] | None = None,
  categoricals: dict[str, FieldAttr] | None = None,
- verbosity: str = "hint",
- organism: str | None = None,
  sources: dict[str, Record] | None = None,
  ) -> None:
  super().__init__(
  dataset=mdata,
  categoricals={},
  sources=sources,
- organism=organism,
  )
  self._columns_field = (
  var_index or {}
@@ -1402,25 +1699,20 @@ class MuDataCatManager(CatManager):
  self._verify_modality(self._var_fields.keys())
  self._obs_fields = self._parse_categoricals(categoricals or {})
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
- self._verbosity = verbosity
  self._obs_df_curator = None
  if "obs" in self._modalities:
  self._obs_df_curator = DataFrameCatManager(
  df=self._dataset.obs,
  columns=Feature.name,
  categoricals=self._obs_fields.get("obs", {}),
- verbosity=verbosity,
  sources=self._sources.get("obs"),
- organism=organism,
  )
  self._mod_adata_curators = {
  modality: AnnDataCatManager(
  data=self._dataset[modality],
  var_index=var_index.get(modality),
  categoricals=self._obs_fields.get(modality),
- verbosity=verbosity,
  sources=self._sources.get(modality),
- organism=organism,
  )
  for modality in self._modalities
  if modality != "obs"
@@ -1442,6 +1734,16 @@ class MuDataCatManager(CatManager):
  """Return the non-validated features and labels."""
  if self._non_validated is None:
  raise ValidationError("Please run validate() first!")
+ non_validated = {}
+ if (
+ self._obs_df_curator is not None
+ and len(self._obs_df_curator.non_validated) > 0
+ ):
+ non_validated["obs"] = self._obs_df_curator.non_validated
+ for modality, adata_curator in self._mod_adata_curators.items():
+ if len(adata_curator.non_validated) > 0:
+ non_validated[modality] = adata_curator.non_validated
+ self._non_validated = non_validated
  return self._non_validated

  def _verify_modality(self, modalities: Iterable[str]):
@@ -1487,35 +1789,18 @@ class MuDataCatManager(CatManager):
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
  },
  public=public,
- organism=self._organism,
  sources=self._sources,
  )

- @deprecated(new_name="is run by default")
- def add_new_from_columns(
- self,
- modality: str,
- column_names: list[str] | None = None,
- **kwargs,
- ):
- pass # pragma: no cover
-
+ @deprecated(new_name="add_new_from('var_index')")
  def add_new_from_var_index(self, modality: str, **kwargs):
  """Update variable records.

  Args:
  modality: The modality name.
- organism: The organism name.
  **kwargs: Additional keyword arguments to pass to create new records.
  """
- self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
-
- def _update_registry_all(self):
- """Update all registries."""
- if self._obs_df_curator is not None:
- self._obs_df_curator._update_registry_all(validated_only=True)
- for _, adata_curator in self._mod_adata_curators.items():
- adata_curator._obs_df_curator._update_registry_all(validated_only=True)
+ self._mod_adata_curators[modality].add_new_from(key="var_index", **kwargs)

  def add_new_from(
  self,
@@ -1528,39 +1813,30 @@ class MuDataCatManager(CatManager):
  Args:
  key: The key referencing the slot in the DataFrame.
  modality: The modality name.
- organism: The organism name.
  **kwargs: Additional keyword arguments to pass to create new records.
  """
- if len(kwargs) > 0 and key == "all":
- raise ValueError("Cannot pass additional arguments to 'all' key!")
  modality = modality or "obs"
  if modality in self._mod_adata_curators:
  adata_curator = self._mod_adata_curators[modality]
  adata_curator.add_new_from(key=key, **kwargs)
  if modality == "obs":
  self._obs_df_curator.add_new_from(key=key, **kwargs)
+ if key == "var_index":
+ self._mod_adata_curators[modality].add_new_from(key=key, **kwargs)

  def validate(self) -> bool:
  """Validate categories."""
- # add all validated records to the current instance
- self._update_registry_all()
- self._non_validated = {} # type: ignore
-
  obs_validated = True
  if "obs" in self._modalities:
  logger.info('validating categoricals in "obs"...')
  obs_validated &= self._obs_df_curator.validate()
- self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
- logger.print("")

  mods_validated = True
  for modality, adata_curator in self._mod_adata_curators.items():
  logger.info(f'validating categoricals in modality "{modality}"...')
  mods_validated &= adata_curator.validate()
- if len(adata_curator.non_validated) > 0:
- self._non_validated[modality] = adata_curator.non_validated # type: ignore
- logger.print("")

+ self._non_validated = {} # so it's no longer None
  self._is_validated = obs_validated & mods_validated
  return self._is_validated

@@ -1592,6 +1868,7 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
  )


+ @deprecated(new_name="SpatialDataCurator")
  class SpatialDataCatManager(CatManager):
  """Categorical manager for `SpatialData`."""

@@ -1600,8 +1877,6 @@ class SpatialDataCatManager(CatManager):
  sdata: Any,
  var_index: dict[str, FieldAttr],
  categoricals: dict[str, dict[str, FieldAttr]] | None = None,
- verbosity: str = "hint",
- organism: str | None = None,
  sources: dict[str, dict[str, Record]] | None = None,
  *,
  sample_metadata_key: str | None = "sample",
@@ -1610,7 +1885,6 @@ class SpatialDataCatManager(CatManager):
  dataset=sdata,
  categoricals={},
  sources=sources,
- organism=organism,
  )
  if isinstance(sdata, Artifact):
  self._sdata = sdata.load()
@@ -1624,7 +1898,6 @@ class SpatialDataCatManager(CatManager):
  self._table_keys = set(self._var_fields.keys()) | set(
  self._categoricals.keys() - {self._sample_metadata_key}
  )
- self._verbosity = verbosity
  self._sample_df_curator = None
  if self._sample_metadata_key is not None:
  self._sample_metadata = self._sdata.get_attrs(
@@ -1675,18 +1948,14 @@ class SpatialDataCatManager(CatManager):
  df=self._sample_metadata,
  columns=Feature.name,
  categoricals=self._categoricals.get(self._sample_metadata_key, {}),
- verbosity=verbosity,
  sources=self._sources.get(self._sample_metadata_key),
- organism=organism,
  )
  self._table_adata_curators = {
  table: AnnDataCatManager(
  data=self._sdata[table],
  var_index=var_index.get(table),
  categoricals=self._categoricals.get(table),
- verbosity=verbosity,
  sources=self._sources.get(table),
- organism=organism,
  )
  for table in self._table_keys
  }
@@ -1708,7 +1977,15 @@ class SpatialDataCatManager(CatManager):
  """Return the non-validated features and labels."""
  if self._non_validated is None:
  raise ValidationError("Please run validate() first!")
- return self._non_validated
+ non_curated = {}
+ if len(self._sample_df_curator.non_validated) > 0:
+ non_curated[self._sample_metadata_key] = (
+ self._sample_df_curator.non_validated
+ )
+ for table, adata_curator in self._table_adata_curators.items():
+ if len(adata_curator.non_validated) > 0:
+ non_curated[table] = adata_curator.non_validated
+ return non_curated

  def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
  """Verify that the accessors exist (either a valid table or in attrs)."""
@@ -1734,38 +2011,19 @@ class SpatialDataCatManager(CatManager):
1734
2011
  categoricals=cat_values_dict,
1735
2012
  slots={"accessors": cat_values_dict.keys()},
1736
2013
  public=public,
1737
- organism=self._organism,
1738
2014
  sources=self._sources,
1739
2015
  )
1740
2016
 
1741
- def _update_registry_all(self) -> None:
1742
- """Saves labels of all features for sample and table metadata."""
1743
- if self._sample_df_curator is not None:
1744
- self._sample_df_curator._update_registry_all(
1745
- validated_only=True,
1746
- )
1747
- for _, adata_curator in self._table_adata_curators.items():
1748
- adata_curator._obs_df_curator._update_registry_all(
1749
- validated_only=True,
1750
- )
1751
-
2017
+ @deprecated(new_name="add_new_from('var_index')")
1752
2018
  def add_new_from_var_index(self, table: str, **kwargs) -> None:
1753
2019
  """Save new values from ``.var.index`` of table.
1754
2020
 
1755
2021
  Args:
1756
2022
  table: The table key.
1757
- organism: The organism name.
1758
2023
  **kwargs: Additional keyword arguments to pass to create new records.
1759
2024
  """
1760
- if self._non_validated is None:
1761
- raise ValidationError("Run .validate() first.")
1762
- self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1763
2025
  if table in self.non_validated.keys():
1764
- if "var_index" in self._non_validated[table]:
1765
- self._non_validated[table].pop("var_index")
1766
-
1767
- if len(self.non_validated[table].values()) == 0:
1768
- self.non_validated.pop(table)
2026
+ self._table_adata_curators[table].add_new_from(key="var_index", **kwargs)
1769
2027
 
1770
2028
  def add_new_from(
1771
2029
  self,
@@ -1778,29 +2036,17 @@ class SpatialDataCatManager(CatManager):
1778
2036
  Args:
1779
2037
  key: The key referencing the slot in the DataFrame.
1780
2038
  accessor: The accessor key such as 'sample' or 'table x'.
1781
- organism: The organism name.
1782
2039
  **kwargs: Additional keyword arguments to pass to create new records.
1783
2040
  """
1784
- if self._non_validated is None:
1785
- raise ValidationError("Run .validate() first.")
1786
-
1787
- if len(kwargs) > 0 and key == "all":
1788
- raise ValueError("Cannot pass additional arguments to 'all' key!")
1789
-
1790
- if accessor not in self.categoricals:
1791
- raise ValueError(
1792
- f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
1793
- )
1794
-
1795
- if accessor in self._table_adata_curators:
1796
- adata_curator = self._table_adata_curators[accessor]
1797
- adata_curator.add_new_from(key=key, **kwargs)
1798
- if accessor == self._sample_metadata_key:
1799
- self._sample_df_curator.add_new_from(key=key, **kwargs)
1800
-
1801
2041
  if accessor in self.non_validated.keys():
1802
- if len(self.non_validated[accessor].values()) == 0:
1803
- self.non_validated.pop(accessor)
2042
+ if accessor in self._table_adata_curators:
2043
+ adata_curator = self._table_adata_curators[accessor]
2044
+ adata_curator.add_new_from(key=key, **kwargs)
2045
+ if accessor == self._sample_metadata_key:
2046
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
2047
+
2048
+ if key == "var_index":
2049
+ self._table_adata_curators[accessor].add_new_from(key=key, **kwargs)
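The simplified body only acts when the accessor still appears in `non_validated`, routing either to the matching table curator or, for the sample-metadata key, to the sample DataFrame curator. A brief usage sketch (hypothetical column and accessor names):

    # register new terms for a categorical column of a table
    curator.add_new_from(key="cell_type", accessor="table1")
    # register new terms for a sample-level column
    curator.add_new_from(key="assay", accessor="sample")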
1804
2050
 
1805
2051
  def standardize(self, key: str, accessor: str | None = None) -> None:
1806
2052
  """Replace synonyms with canonical values.
@@ -1835,42 +2081,27 @@ class SpatialDataCatManager(CatManager):
1835
2081
  if accessor == self._sample_metadata_key:
1836
2082
  self._sample_df_curator.standardize(key)
1837
2083
 
1838
- if len(self.non_validated[accessor].values()) == 0:
1839
- self.non_validated.pop(accessor)
1840
-
1841
2084
  def validate(self) -> bool:
1842
2085
  """Validate variables and categorical observations.
1843
2086
 
1844
2087
  This method also registers the validated records in the current instance:
1845
2088
  - from public sources
1846
2089
 
1847
- Args:
1848
- organism: The organism name.
1849
-
1850
2090
  Returns:
1851
2091
  Whether the SpatialData object is validated.
1852
2092
  """
1853
2093
  # add all validated records to the current instance
1854
- self._update_registry_all()
1855
-
1856
- self._non_validated = {} # type: ignore
1857
-
1858
2094
  sample_validated = True
1859
2095
  if self._sample_df_curator:
1860
2096
  logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
1861
2097
  sample_validated &= self._sample_df_curator.validate()
1862
- if len(self._sample_df_curator.non_validated) > 0:
1863
- self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
1864
- logger.print("")
1865
2098
 
1866
2099
  mods_validated = True
1867
2100
  for table, adata_curator in self._table_adata_curators.items():
1868
2101
  logger.info(f"validating categoricals of table '{table}' ...")
1869
2102
  mods_validated &= adata_curator.validate()
1870
- if len(adata_curator.non_validated) > 0:
1871
- self._non_validated[table] = adata_curator.non_validated # type: ignore
1872
- logger.print("")
1873
2103
 
2104
+ self._non_validated = {} # so it's no longer None
1874
2105
  self._is_validated = sample_validated & mods_validated
1875
2106
  return self._is_validated
1876
2107
 
@@ -1899,17 +2130,12 @@ class SpatialDataCatManager(CatManager):
1899
2130
  if not self._is_validated:
1900
2131
  raise ValidationError("Dataset does not validate. Please curate.")
1901
2132
 
1902
- return save_artifact(
1903
- self._sdata,
1904
- description=description,
1905
- fields=self.categoricals,
2133
+ self._artifact = Artifact.from_spatialdata(
2134
+ self._dataset, key=key, description=description, revises=revises, run=run
2135
+ ).save()
2136
+ return annotate_artifact(
2137
+ self._artifact,
1906
2138
  index_field=self.var_index,
1907
- key=key,
1908
- artifact=self._artifact,
1909
- revises=revises,
1910
- run=run,
1911
- schema=None,
1912
- organism=self._organism,
1913
2139
  sample_metadata_key=self._sample_metadata_key,
1914
2140
  )
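`save_artifact` now persists the object first via `Artifact.from_spatialdata(...).save()` and then hands annotation off to `annotate_artifact`, instead of routing everything through the removed `save_artifact` helper. A condensed sketch of the same two-step flow (`sdata` is a hypothetical SpatialData object; `curator.var_index` mirrors the attribute used above):

    artifact = Artifact.from_spatialdata(
        sdata, key="my_dataset.zarr", description="curated spatial dataset"
    ).save()
    artifact = annotate_artifact(
        artifact,
        index_field=curator.var_index,
        sample_metadata_key="sample",
    )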
1915
2141
 
@@ -1923,7 +2149,6 @@ class TiledbsomaCatManager(CatManager):
1923
2149
  var_index: dict[str, tuple[str, FieldAttr]],
1924
2150
  categoricals: dict[str, FieldAttr] | None = None,
1925
2151
  obs_columns: FieldAttr = Feature.name,
1926
- organism: str | None = None,
1927
2152
  sources: dict[str, Record] | None = None,
1928
2153
  ):
1929
2154
  self._obs_fields = categoricals or {}
@@ -1935,7 +2160,6 @@ class TiledbsomaCatManager(CatManager):
1935
2160
  else:
1936
2161
  self._dataset = UPath(experiment_uri)
1937
2162
  self._artifact = None
1938
- self._organism = organism
1939
2163
  self._sources = sources or {}
1940
2164
 
1941
2165
  self._is_validated: bool | None = False
@@ -2004,28 +2228,14 @@ class TiledbsomaCatManager(CatManager):
2004
2228
 
2005
2229
  # register obs columns' names
2006
2230
  register_columns = list(self._obs_fields.keys())
2007
- update_registry(
2008
- values=register_columns,
2231
+ # register categorical keys as features
2232
+ cat_column = CatColumn(
2233
+ values_getter=register_columns,
2009
2234
  field=self._columns_field,
2010
2235
  key="columns",
2011
- validated_only=False,
2012
- organism=self._organism,
2013
2236
  source=self._sources.get("columns"),
2014
2237
  )
2015
- additional_columns = [k for k in valid_obs_keys if k not in register_columns]
2016
- # no need to register with validated_only=True if columns are features
2017
- if (
2018
- len(additional_columns) > 0
2019
- and self._columns_field.field.model is not Feature
2020
- ):
2021
- update_registry(
2022
- values=additional_columns,
2023
- field=self._columns_field,
2024
- key="columns",
2025
- validated_only=True,
2026
- organism=self._organism,
2027
- source=self._sources.get("columns"),
2028
- )
2238
+ cat_column.add_new()
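Throughout `TiledbsomaCatManager`, the former `update_registry` / `validate_categories` / `standardize_categories` calls collapse into the module-internal `CatColumn` helper: construct it with the values, the target field, a key, and an optional source, then call `add_new()` or `validate()` and read `_non_validated` / `_synonyms`. A minimal sketch of that pattern (private API; the values are hypothetical):

    import bionty as bt

    cat_column = CatColumn(
        values_getter=["astrocyte", "astrocytee"],  # hypothetical values
        field=bt.CellType.name,
        key="cell_type",
        source=None,
    )
    cat_column.validate()
    print(cat_column._non_validated)  # values not found in the registry
    print(cat_column._synonyms)       # recognized synonym mappings
    cat_column.add_new()              # save remaining values as new records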
2029
2239
 
2030
2240
  def validate(self):
2031
2241
  """Validate categories."""
@@ -2043,21 +2253,14 @@ class TiledbsomaCatManager(CatManager):
2043
2253
  var_ms_values = (
2044
2254
  var_ms.read(column_names=[key]).concat()[key].to_pylist()
2045
2255
  )
2046
- update_registry(
2047
- values=var_ms_values,
2048
- field=field,
2049
- key=var_ms_key,
2050
- validated_only=True,
2051
- organism=self._organism,
2052
- source=self._sources.get(var_ms_key),
2053
- )
2054
- _, non_val = validate_categories(
2055
- values=var_ms_values,
2256
+ cat_column = CatColumn(
2257
+ values_getter=var_ms_values,
2056
2258
  field=field,
2057
2259
  key=var_ms_key,
2058
- organism=self._organism,
2059
2260
  source=self._sources.get(var_ms_key),
2060
2261
  )
2262
+ cat_column.validate()
2263
+ non_val = cat_column._non_validated
2061
2264
  if len(non_val) > 0:
2062
2265
  validated = False
2063
2266
  self._non_validated_values[var_ms_key] = non_val
@@ -2072,21 +2275,14 @@ class TiledbsomaCatManager(CatManager):
2072
2275
  values = pa.compute.unique(
2073
2276
  obs.read(column_names=[key]).concat()[key]
2074
2277
  ).to_pylist()
2075
- update_registry(
2076
- values=values,
2077
- field=field,
2078
- key=key,
2079
- validated_only=True,
2080
- organism=self._organism,
2081
- source=self._sources.get(key),
2082
- )
2083
- _, non_val = validate_categories(
2084
- values=values,
2278
+ cat_column = CatColumn(
2279
+ values_getter=values,
2085
2280
  field=field,
2086
2281
  key=key,
2087
- organism=self._organism,
2088
2282
  source=self._sources.get(key),
2089
2283
  )
2284
+ cat_column.validate()
2285
+ non_val = cat_column._non_validated
2090
2286
  if len(non_val) > 0:
2091
2287
  validated = False
2092
2288
  self._non_validated_values[key] = non_val
@@ -2133,15 +2329,13 @@ class TiledbsomaCatManager(CatManager):
2133
2329
  values, field = self._non_validated_values_field(k)
2134
2330
  if len(values) == 0:
2135
2331
  continue
2136
- update_registry(
2137
- values=values,
2332
+ cat_column = CatColumn(
2333
+ values_getter=values,
2138
2334
  field=field,
2139
2335
  key=k,
2140
- validated_only=False,
2141
- organism=self._organism,
2142
2336
  source=self._sources.get(k),
2143
- **kwargs,
2144
2337
  )
2338
+ cat_column.add_new()
2145
2339
  # update non-validated values list but keep the key there
2146
2340
  # it will be removed by .validate()
2147
2341
  if k in self._non_validated_values:
@@ -2173,7 +2367,6 @@ class TiledbsomaCatManager(CatManager):
2173
2367
  categoricals=self._obs_fields,
2174
2368
  slots={"columns": self._columns_field, **self._var_fields_flat},
2175
2369
  public=public,
2176
- organism=self._organism,
2177
2370
  sources=self._sources,
2178
2371
  )
2179
2372
 
@@ -2210,12 +2403,14 @@ class TiledbsomaCatManager(CatManager):
2210
2403
  else:
2211
2404
  slot = lambda experiment: experiment.obs
2212
2405
  slot_key = k
2213
- syn_mapper = standardize_categories(
2214
- values=values,
2406
+ cat_column = CatColumn(
2407
+ values_getter=values,
2215
2408
  field=field,
2409
+ key=k,
2216
2410
  source=self._sources.get(k),
2217
- organism=self._organism,
2218
2411
  )
2412
+ cat_column.validate()
2413
+ syn_mapper = cat_column._synonyms
2219
2414
  if (n_syn_mapper := len(syn_mapper)) == 0:
2220
2415
  continue
2221
2416
 
@@ -2300,14 +2495,12 @@ class TiledbsomaCatManager(CatManager):
2300
2495
  df=mock_df,
2301
2496
  field=self._columns_field,
2302
2497
  mute=True,
2303
- organism=self._organism,
2304
2498
  )
2305
2499
  for ms in self._var_fields:
2306
2500
  var_key, var_field = self._var_fields[ms]
2307
2501
  feature_sets[f"{ms}__var"] = Schema.from_values(
2308
2502
  values=self._validated_values[f"{ms}__{var_key}"],
2309
2503
  field=var_field,
2310
- organism=self._organism,
2311
2504
  raise_validation_error=False,
2312
2505
  )
2313
2506
  artifact._staged_feature_sets = feature_sets
@@ -2320,7 +2513,6 @@ class TiledbsomaCatManager(CatManager):
2320
2513
  labels = registry.from_values(
2321
2514
  values=self._validated_values[key],
2322
2515
  field=field,
2323
- organism=self._organism,
2324
2516
  )
2325
2517
  if len(labels) == 0:
2326
2518
  continue
@@ -2359,12 +2551,10 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2359
2551
  self,
2360
2552
  adata: ad.AnnData,
2361
2553
  categoricals: dict[str, FieldAttr] | None = None,
2362
- organism: Literal["human", "mouse"] = "human",
2363
2554
  *,
2364
2555
  schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2365
2556
  defaults: dict[str, str] = None,
2366
2557
  extra_sources: dict[str, Record] = None,
2367
- verbosity: str = "hint",
2368
2558
  ) -> None:
2369
2559
  """CELLxGENE schema curator.
2370
2560
 
@@ -2372,13 +2562,11 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2372
2562
  adata: Path to or AnnData object to curate against the CELLxGENE schema.
2373
2563
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2374
2564
  The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2375
- organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2376
2565
  schema_version: The CELLxGENE schema version to curate against.
2377
2566
  defaults: Default values that are set if columns or column values are missing.
2378
2567
  extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2379
2568
  These extra sources are joined with the CELLxGENE fixed sources.
2380
2569
  Use this parameter when subclassing.
2381
- verbosity: The verbosity level.
2382
2570
  """
2383
2571
  import bionty as bt
2384
2572
 
@@ -2399,6 +2587,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2399
2587
  categoricals = _restrict_obs_fields(adata.obs, categoricals)
2400
2588
 
2401
2589
  # Configure sources
2590
+ organism: Literal["human", "mouse"] = "human"
2402
2591
  sources = _create_sources(categoricals, schema_version, organism)
2403
2592
  self.schema_version = schema_version
2404
2593
  self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
@@ -2413,16 +2602,9 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2413
2602
  data=adata,
2414
2603
  var_index=bt.Gene.ensembl_gene_id,
2415
2604
  categoricals=categoricals,
2416
- verbosity=verbosity,
2417
- organism=organism,
2418
2605
  sources=sources,
2419
2606
  )
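The constructor no longer takes `organism` or `verbosity`; the organism used for the fixed CELLxGENE sources is now set internally (defaulting to "human"). A sketch of instantiating the curator with the remaining arguments (assuming `adata` is an AnnData object):

    curator = CellxGeneAnnDataCatManager(
        adata,
        schema_version="5.2.0",
        defaults=CellxGeneAnnDataCatManager.cxg_categoricals_defaults,
    )
    curator.validate()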
2420
2607
 
2421
- @classmethod
2422
- @deprecated(new_name="cxg_categoricals_defaults")
2423
- def _get_categoricals_defaults(cls) -> dict[str, str]:
2424
- return cls.cxg_categoricals_defaults
2425
-
2426
2608
  @classmethod
2427
2609
  def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
2428
2610
  """Returns the CELLxGENE schema mapped fields."""
@@ -2695,7 +2877,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2695
2877
  pert_time: bool = True,
2696
2878
  *,
2697
2879
  cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
2698
- verbosity: str = "hint",
2699
2880
  ):
2700
2881
  """Initialize the curator with configuration and validation settings."""
2701
2882
  self._pert_time = pert_time
@@ -2708,10 +2889,8 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2708
2889
  adata=adata,
2709
2890
  categoricals=categoricals,
2710
2891
  defaults=categoricals_defaults,
2711
- organism=organism,
2712
2892
  extra_sources=self._configure_sources(adata),
2713
2893
  schema_version=cxg_schema_version,
2714
- verbosity=verbosity,
2715
2894
  )
2716
2895
 
2717
2896
  def _configure_categoricals(self, adata: ad.AnnData):
@@ -2952,7 +3131,7 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
2952
3131
 
2953
3132
 
2954
3133
  def get_organism_kwargs(
2955
- field: FieldAttr, organism: str | None = None
3134
+ field: FieldAttr, organism: str | None = None, values: Any = None
2956
3135
  ) -> dict[str, str]:
2957
3136
  """Check if a registry needs an organism and return the organism name."""
2958
3137
  registry = field.field.model
@@ -2967,245 +3146,47 @@ def get_organism_kwargs(
2967
3146
  return {"organism": organism or bt.settings.organism.name}
2968
3147
  else:
2969
3148
  organism_record = get_organism_record_from_field(
2970
- field, organism=organism
3149
+ field, organism=organism, values=values
2971
3150
  )
2972
3151
  if organism_record is not None:
2973
3152
  return {"organism": organism_record.name}
2974
3153
  return {}
2975
3154
 
2976
3155
 
2977
- def validate_categories(
2978
- values: Iterable[str],
2979
- field: FieldAttr,
2980
- key: str,
2981
- organism: str | None = None,
2982
- source: Record | None = None,
2983
- hint_print: str | None = None,
2984
- curator: CatManager | None = None,
2985
- ) -> tuple[bool, list[str]]:
2986
- """Validate ontology terms using LaminDB registries.
2987
-
2988
- Args:
2989
- values: The values to validate.
2990
- field: The field attribute.
2991
- key: The key referencing the slot in the DataFrame.
2992
- organism: The organism name.
2993
- source: The source record.
2994
- standardize: Whether to standardize the values.
2995
- hint_print: The hint to print that suggests fixing non-validated values.
2996
- """
2997
- model_field = f"{field.field.model.__name__}.{field.field.name}"
2998
-
2999
- def _log_mapping_info():
3000
- logger.indent = ""
3001
- logger.info(f'mapping "{key}" on {colors.italic(model_field)}')
3002
- logger.indent = " "
3003
-
3004
- registry = field.field.model
3005
-
3006
- kwargs_current = get_current_filter_kwargs(
3007
- registry, {"organism": organism, "source": source}
3008
- )
3009
-
3010
- # inspect values from the default instance
3011
- inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
3012
- non_validated = inspect_result.non_validated
3013
- syn_mapper = inspect_result.synonyms_mapper
3014
-
3015
- # inspect the non-validated values from public (BioRecord only)
3016
- values_validated = []
3017
- if hasattr(registry, "public"):
3018
- public_records = registry.from_values(
3019
- non_validated,
3020
- field=field,
3021
- mute=True,
3022
- **kwargs_current,
3023
- )
3024
- values_validated += [getattr(r, field.field.name) for r in public_records]
3025
-
3026
- # logging messages
3027
- non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
3028
- non_validated = [i for i in non_validated if i not in values_validated]
3029
- n_non_validated = len(non_validated)
3030
- if n_non_validated == 0:
3031
- logger.indent = ""
3032
- logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
3033
- return True, []
3034
- else:
3035
- are = "is" if n_non_validated == 1 else "are"
3036
- s = "" if n_non_validated == 1 else "s"
3037
- print_values = _format_values(non_validated)
3038
- warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
3039
- if syn_mapper:
3040
- s = "" if len(syn_mapper) == 1 else "s"
3041
- syn_mapper_print = _format_values(
3042
- [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
3043
- )
3044
- hint_msg = f'.standardize("{key}")'
3045
- warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
3046
- if n_non_validated > len(syn_mapper):
3047
- if syn_mapper:
3048
- warning_message += "\n for remaining terms:\n"
3049
- warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
3050
-
3051
- if logger.indent == "":
3052
- _log_mapping_info()
3053
- logger.warning(warning_message)
3054
- if curator is not None:
3055
- curator._validate_category_error_messages = strip_ansi_codes(
3056
- warning_message
3057
- )
3058
- logger.indent = ""
3059
- return False, non_validated
3060
-
3061
-
3062
- def standardize_categories(
3063
- values: Iterable[str],
3064
- field: FieldAttr,
3065
- organism: str | None = None,
3066
- source: Record | None = None,
3067
- ) -> dict:
3068
- """Get a synonym mapper."""
3069
- registry = field.field.model
3070
- if not hasattr(registry, "standardize"):
3071
- return {}
3072
- # standardize values using the default instance
3073
- syn_mapper = registry.standardize(
3074
- values,
3075
- field=field.field.name,
3076
- organism=organism,
3077
- source=source,
3078
- mute=True,
3079
- return_mapper=True,
3080
- )
3081
- return syn_mapper
3082
-
3083
-
3084
- def validate_categories_in_df(
3085
- df: pd.DataFrame,
3086
- fields: dict[str, FieldAttr],
3087
- sources: dict[str, Record] = None,
3088
- curator: CatManager | None = None,
3089
- **kwargs,
3090
- ) -> tuple[bool, dict]:
3091
- """Validate categories in DataFrame columns using LaminDB registries."""
3092
- if not fields:
3093
- return True, {}
3094
-
3095
- if sources is None:
3096
- sources = {}
3097
- validated = True
3098
- non_validated = {}
3099
- for key, field in fields.items():
3100
- is_val, non_val = validate_categories(
3101
- df[key],
3102
- field=field,
3103
- key=key,
3104
- source=sources.get(key),
3105
- curator=curator,
3106
- **kwargs,
3107
- )
3108
- validated &= is_val
3109
- if len(non_val) > 0:
3110
- non_validated[key] = non_val
3111
- return validated, non_validated
3112
-
3113
-
3114
- def save_artifact(
3115
- data: pd.DataFrame | ScverseDataStructures,
3156
+ def annotate_artifact(
3157
+ artifact: Artifact,
3116
3158
  *,
3117
- fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
3118
- index_field: FieldAttr | dict[str, FieldAttr] | None = None,
3119
- description: str | None = None,
3120
- organism: str | None = None,
3121
- key: str | None = None,
3122
- artifact: Artifact | None = None,
3123
- revises: Artifact | None = None,
3124
- run: Run | None = None,
3125
3159
  schema: Schema | None = None,
3160
+ cat_columns: dict[str, CatColumn] | None = None,
3161
+ index_field: FieldAttr | dict[str, FieldAttr] | None = None,
3126
3162
  **kwargs,
3127
3163
  ) -> Artifact:
3128
- """Save all metadata with an Artifact.
3129
-
3130
- Args:
3131
- data: The object to save.
3132
- fields: A dictionary mapping obs_column to registry_field.
3133
- index_field: The registry field to validate variables index against.
3134
- description: A description of the artifact.
3135
- organism: The organism name.
3136
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3137
- artifact: A already registered artifact. Passing this will not save a new artifact from data.
3138
- revises: Previous version of the artifact. Triggers a revision.
3139
- run: The run that creates the artifact.
3140
- schema: The Schema to associate with the Artifact.
3141
-
3142
- Returns:
3143
- The saved Artifact.
3144
- """
3145
3164
  from ..models.artifact import add_labels
3146
3165
 
3147
- if artifact is None:
3148
- if isinstance(data, pd.DataFrame):
3149
- artifact = Artifact.from_df(
3150
- data, description=description, key=key, revises=revises, run=run
3151
- )
3152
- elif isinstance(data, AnnData):
3153
- artifact = Artifact.from_anndata(
3154
- data, description=description, key=key, revises=revises, run=run
3155
- )
3156
- elif data_is_mudata(data):
3157
- artifact = Artifact.from_mudata(
3158
- data, description=description, key=key, revises=revises, run=run
3159
- )
3160
- elif data_is_spatialdata(data):
3161
- artifact = Artifact.from_spatialdata(
3162
- data, description=description, key=key, revises=revises, run=run
3163
- )
3164
- else:
3165
- raise InvalidArgument( # pragma: no cover
3166
- "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
3167
- )
3168
- artifact.save()
3166
+ if cat_columns is None:
3167
+ cat_columns = {}
3169
3168
 
3170
- def _add_labels(
3171
- data: pd.DataFrame | ScverseDataStructures,
3172
- artifact: Artifact,
3173
- fields: dict[str, FieldAttr],
3174
- feature_ref_is_name: bool | None = None,
3175
- ):
3176
- features = Feature.lookup().dict()
3177
- for key, field in fields.items():
3178
- feature = features.get(key)
3179
- registry = field.field.model
3180
- # we don't need source here because all records are already in the DB
3181
- filter_kwargs = get_current_filter_kwargs(registry, {"organism": organism})
3182
- df = data if isinstance(data, pd.DataFrame) else data.obs
3183
- # multi-value columns are separated by "|"
3184
- if not df[key].isna().all() and df[key].str.contains("|").any():
3185
- values = df[key].str.split("|").explode().unique()
3186
- else:
3187
- values = df[key].unique()
3188
- labels = registry.from_values(values, field=field, **filter_kwargs)
3189
- if len(labels) == 0:
3190
- continue
3191
- label_ref_is_name = None
3192
- if hasattr(registry, "_name_field"):
3193
- label_ref_is_name = field.field.name == registry._name_field
3194
- add_labels(
3195
- artifact,
3196
- records=labels,
3197
- feature=feature,
3198
- feature_ref_is_name=feature_ref_is_name,
3199
- label_ref_is_name=label_ref_is_name,
3200
- from_curator=True,
3201
- )
3169
+ # annotate with labels
3170
+ for key, cat_column in cat_columns.items():
3171
+ if (
3172
+ cat_column._field.field.model == Feature
3173
+ or key == "columns"
3174
+ or key == "var_index"
3175
+ ):
3176
+ continue
3177
+ add_labels(
3178
+ artifact,
3179
+ records=cat_column.labels,
3180
+ feature=cat_column.feature,
3181
+ feature_ref_is_name=None, # no longer needed
3182
+ label_ref_is_name=cat_column.label_ref_is_name,
3183
+ from_curator=True,
3184
+ )
3202
3185
 
3186
+ # annotate with inferred feature sets
3203
3187
  match artifact.otype:
3204
3188
  case "DataFrame":
3205
- artifact.features._add_set_from_df(field=index_field, organism=organism) # type: ignore
3206
- _add_labels(
3207
- data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3208
- )
3189
+ artifact.features._add_set_from_df(field=index_field) # type: ignore
3209
3190
  case "AnnData":
3210
3191
  if schema is not None and "uns" in schema.slots:
3211
3192
  uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
@@ -3214,80 +3195,25 @@ def save_artifact(
3214
3195
  else:
3215
3196
  uns_field = None
3216
3197
  artifact.features._add_set_from_anndata( # type: ignore
3217
- var_field=index_field, uns_field=uns_field, organism=organism
3218
- )
3219
- _add_labels(
3220
- data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3198
+ var_field=index_field, uns_field=uns_field
3221
3199
  )
3222
3200
  case "MuData":
3223
- artifact.features._add_set_from_mudata( # type: ignore
3224
- var_fields=index_field, organism=organism
3225
- )
3226
- for modality, modality_fields in fields.items():
3227
- column_field_modality = index_field.get(modality)
3228
- if modality == "obs":
3229
- _add_labels(
3230
- data,
3231
- artifact,
3232
- modality_fields,
3233
- feature_ref_is_name=(
3234
- None
3235
- if column_field_modality is None
3236
- else _ref_is_name(column_field_modality)
3237
- ),
3238
- )
3239
- else:
3240
- _add_labels(
3241
- data[modality],
3242
- artifact,
3243
- modality_fields,
3244
- feature_ref_is_name=(
3245
- None
3246
- if column_field_modality is None
3247
- else _ref_is_name(column_field_modality)
3248
- ),
3249
- )
3201
+ artifact.features._add_set_from_mudata(var_fields=index_field) # type: ignore
3250
3202
  case "SpatialData":
3251
3203
  artifact.features._add_set_from_spatialdata( # type: ignore
3252
3204
  sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
3253
3205
  var_fields=index_field,
3254
- organism=organism,
3255
3206
  )
3256
- sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
3257
- for accessor, accessor_fields in fields.items():
3258
- column_field = index_field.get(accessor)
3259
- if accessor == sample_metadata_key:
3260
- _add_labels(
3261
- data.get_attrs(
3262
- key=sample_metadata_key, return_as="df", flatten=True
3263
- ),
3264
- artifact,
3265
- accessor_fields,
3266
- feature_ref_is_name=(
3267
- None if column_field is None else _ref_is_name(column_field)
3268
- ),
3269
- )
3270
- else:
3271
- _add_labels(
3272
- data.tables[accessor],
3273
- artifact,
3274
- accessor_fields,
3275
- feature_ref_is_name=(
3276
- None if column_field is None else _ref_is_name(column_field)
3277
- ),
3278
- )
3279
3207
  case _:
3280
3208
  raise NotImplementedError # pragma: no cover
3281
3209
 
3282
- artifact.schema = schema
3283
- artifact.save()
3284
-
3285
3210
  slug = ln_setup.settings.instance.slug
3286
3211
  if ln_setup.settings.instance.is_remote: # pragma: no cover
3287
3212
  logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
3288
3213
  return artifact
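`annotate_artifact` operates on an already saved `Artifact`: columns whose field is `Feature` and the reserved "columns" / "var_index" keys are skipped, every other `CatColumn` contributes its `labels` via `add_labels`, and the per-`otype` branch then adds the inferred feature sets. A condensed sketch for a DataFrame-backed curator (the `_cat_columns` accessor is an assumption; the exact attribute may differ per curator class):

    artifact = Artifact.from_df(df, key="curated.parquet").save()
    artifact = annotate_artifact(
        artifact,
        cat_columns=curator._cat_columns,  # {column_name: CatColumn}, hypothetical accessor
        index_field=Feature.name,          # columns field for the DataFrame branch
    )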
3289
3214
 
3290
3215
 
3216
+ # TODO: need this function to support multi-value columns
3291
3217
  def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
3292
3218
  """Flatten a Pandas series containing lists or single items into a unique list of elements."""
3293
3219
  result = set()
@@ -3301,138 +3227,6 @@ def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
3301
3227
  return list(result)
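A small sketch of the behavior `_flatten_unique` provides for such mixed columns (values hypothetical):

    import pandas as pd

    s = pd.Series([["a", "b"], "b", ["c"], "a"])
    print(sorted(_flatten_unique(s)))  # ['a', 'b', 'c']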
3302
3228
 
3303
3229
 
3304
- def update_registry(
3305
- values: list[str],
3306
- field: FieldAttr,
3307
- key: str,
3308
- validated_only: bool = True,
3309
- df: pd.DataFrame | None = None,
3310
- organism: str | None = None,
3311
- dtype: str | None = None,
3312
- source: Record | None = None,
3313
- **create_kwargs,
3314
- ) -> None:
3315
- """Save features or labels records in the default instance..
3316
-
3317
- Args:
3318
- values: A list of values to be saved as labels.
3319
- field: The FieldAttr object representing the field for which labels are being saved.
3320
- key: The name of the feature to save.
3321
- validated_only: If True, only save validated labels.
3322
- df: A DataFrame to save labels from.
3323
- organism: The organism name.
3324
- dtype: The type of the feature.
3325
- source: The source record.
3326
- **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
3327
- """
3328
- from lamindb.models.save import save as ln_save
3329
-
3330
- registry = field.field.model
3331
- filter_kwargs = get_current_filter_kwargs(
3332
- registry, {"organism": organism, "source": source}
3333
- )
3334
- values = [i for i in values if isinstance(i, str) and i]
3335
- if not values:
3336
- return
3337
-
3338
- labels_saved: dict = {"from public": [], "new": []}
3339
-
3340
- # inspect the default instance and save validated records from public
3341
- existing_and_public_records = registry.from_values(
3342
- list(values), field=field, **filter_kwargs, mute=True
3343
- )
3344
- existing_and_public_labels = [
3345
- getattr(r, field.field.name) for r in existing_and_public_records
3346
- ]
3347
- # public records that are not already in the database
3348
- public_records = [r for r in existing_and_public_records if r._state.adding]
3349
- # here we check to only save the public records if they are from the specified source
3350
- # we check the uid because r.source and source can be from different instances
3351
- if source:
3352
- public_records = [r for r in public_records if r.source.uid == source.uid]
3353
- if len(public_records) > 0:
3354
- logger.info(f"saving validated records of '{key}'")
3355
- ln_save(public_records)
3356
- labels_saved["from public"] = [
3357
- getattr(r, field.field.name) for r in public_records
3358
- ]
3359
- # non-validated records from the default instance
3360
- non_validated_labels = [i for i in values if i not in existing_and_public_labels]
3361
-
3362
- # save non-validated/new records
3363
- labels_saved["new"] = non_validated_labels
3364
- if not validated_only:
3365
- non_validated_records: RecordList[Any] = [] # type: ignore
3366
- if df is not None and registry == Feature:
3367
- nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
3368
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
3369
- else:
3370
- if (
3371
- organism
3372
- and hasattr(registry, "organism")
3373
- and registry._meta.get_field("organism").is_relation
3374
- ):
3375
- # make sure organism record is saved to the current instance
3376
- create_kwargs["organism"] = _save_organism(name=organism)
3377
-
3378
- for value in labels_saved["new"]:
3379
- init_kwargs = {field.field.name: value}
3380
- if registry == Feature:
3381
- init_kwargs["dtype"] = "cat" if dtype is None else dtype
3382
- non_validated_records.append(registry(**init_kwargs, **create_kwargs))
3383
- ln_save(non_validated_records)
3384
-
3385
- # save parent labels for ulabels, for example a parent label "project" for label "project001"
3386
- if registry == ULabel and field.field.name == "name":
3387
- save_ulabels_type(values, field=field, key=key)
3388
-
3389
- log_saved_labels(
3390
- labels_saved,
3391
- key=key,
3392
- model_field=f"{registry.__name__}.{field.field.name}",
3393
- validated_only=validated_only,
3394
- )
3395
-
3396
-
3397
- def log_saved_labels(
3398
- labels_saved: dict,
3399
- key: str,
3400
- model_field: str,
3401
- validated_only: bool = True,
3402
- ) -> None:
3403
- """Log the saved labels."""
3404
- from ..models._from_values import _format_values
3405
-
3406
- model_field = colors.italic(model_field)
3407
- for k, labels in labels_saved.items():
3408
- if not labels:
3409
- continue
3410
- if k == "new" and validated_only:
3411
- continue
3412
- else:
3413
- k = "" if k == "new" else f"{colors.green(k)} "
3414
- # the term "transferred" stresses that this is always in the context of transferring
3415
- # labels from a public ontology or a different instance to the present instance
3416
- s = "s" if len(labels) > 1 else ""
3417
- logger.success(
3418
- f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_format_values(labels)}'
3419
- )
3420
-
3421
-
3422
- def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
3423
- """Save the ULabel type of the given labels."""
3424
- registry = field.field.model
3425
- assert registry == ULabel # noqa: S101
3426
- all_records = registry.filter(**{field.field.name: list(values)}).all()
3427
- # so `tissue_type` becomes `TissueType`
3428
- type_name = "".join([i.capitalize() for i in key.lower().split("_")])
3429
- ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
3430
- if ulabel_type is None:
3431
- ulabel_type = registry(name=type_name, is_type=True).save()
3432
- logger.important(f"Created a ULabel type: {ulabel_type}")
3433
- all_records.update(type=ulabel_type)
3434
-
3435
-
3436
3230
  def _save_organism(name: str):
3437
3231
  """Save an organism record."""
3438
3232
  import bionty as bt
@@ -3469,15 +3263,14 @@ def from_df(
3469
3263
  df: pd.DataFrame,
3470
3264
  categoricals: dict[str, FieldAttr] | None = None,
3471
3265
  columns: FieldAttr = Feature.name,
3472
- verbosity: str = "hint",
3473
3266
  organism: str | None = None,
3474
3267
  ) -> DataFrameCatManager:
3268
+ if organism is not None:
3269
+ logger.warning("organism is ignored, define it on the dtype level")
3475
3270
  return DataFrameCatManager(
3476
3271
  df=df,
3477
3272
  categoricals=categoricals,
3478
3273
  columns=columns,
3479
- verbosity=verbosity,
3480
- organism=organism,
3481
3274
  )
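The wrapper still accepts `organism` for backward compatibility but only logs a warning; organism handling now happens at the dtype level. A short sketch (assuming `df` is a pandas DataFrame):

    import bionty as bt

    curator = from_df(
        df,
        categoricals={"cell_type": bt.CellType.name},
        organism="human",  # logs: "organism is ignored, define it on the dtype level"
    )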
3482
3275
 
3483
3276
 
@@ -3488,17 +3281,16 @@ def from_anndata(
3488
3281
  var_index: FieldAttr,
3489
3282
  categoricals: dict[str, FieldAttr] | None = None,
3490
3283
  obs_columns: FieldAttr = Feature.name,
3491
- verbosity: str = "hint",
3492
3284
  organism: str | None = None,
3493
3285
  sources: dict[str, Record] | None = None,
3494
3286
  ) -> AnnDataCatManager:
3287
+ if organism is not None:
3288
+ logger.warning("organism is ignored, define it on the dtype level")
3495
3289
  return AnnDataCatManager(
3496
3290
  data=data,
3497
3291
  var_index=var_index,
3498
3292
  categoricals=categoricals,
3499
3293
  obs_columns=obs_columns,
3500
- verbosity=verbosity,
3501
- organism=organism,
3502
3294
  sources=sources,
3503
3295
  )
3504
3296
 
@@ -3509,15 +3301,16 @@ def from_mudata(
3509
3301
  mdata: MuData | UPathStr,
3510
3302
  var_index: dict[str, dict[str, FieldAttr]],
3511
3303
  categoricals: dict[str, FieldAttr] | None = None,
3512
- verbosity: str = "hint",
3513
3304
  organism: str | None = None,
3514
3305
  ) -> MuDataCatManager:
3306
+ if not is_package_installed("mudata"):
3307
+ raise ImportError("Please install mudata: pip install mudata")
3308
+ if organism is not None:
3309
+ logger.warning("organism is ignored, define it on the dtype level")
3515
3310
  return MuDataCatManager(
3516
3311
  mdata=mdata,
3517
3312
  var_index=var_index,
3518
3313
  categoricals=categoricals,
3519
- verbosity=verbosity,
3520
- organism=organism,
3521
3314
  )
3522
3315
 
3523
3316
 
@@ -3531,12 +3324,13 @@ def from_tiledbsoma(
3531
3324
  organism: str | None = None,
3532
3325
  sources: dict[str, Record] | None = None,
3533
3326
  ) -> TiledbsomaCatManager:
3327
+ if organism is not None:
3328
+ logger.warning("organism is ignored, define it on the dtype level")
3534
3329
  return TiledbsomaCatManager(
3535
3330
  experiment_uri=experiment_uri,
3536
3331
  var_index=var_index,
3537
3332
  categoricals=categoricals,
3538
3333
  obs_columns=obs_columns,
3539
- organism=organism,
3540
3334
  sources=sources,
3541
3335
  )
3542
3336
 
@@ -3549,21 +3343,17 @@ def from_spatialdata(
3549
3343
  categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3550
3344
  organism: str | None = None,
3551
3345
  sources: dict[str, dict[str, Record]] | None = None,
3552
- verbosity: str = "hint",
3553
3346
  *,
3554
3347
  sample_metadata_key: str = "sample",
3555
3348
  ):
3556
- try:
3557
- import spatialdata
3558
- except ImportError as e:
3559
- raise ImportError("Please install spatialdata: pip install spatialdata") from e
3560
-
3349
+ if not is_package_installed("spatialdata"):
3350
+ raise ImportError("Please install spatialdata: pip install spatialdata")
3351
+ if organism is not None:
3352
+ logger.warning("organism is ignored, define it on the dtype level")
3561
3353
  return SpatialDataCatManager(
3562
3354
  sdata=sdata,
3563
3355
  var_index=var_index,
3564
3356
  categoricals=categoricals,
3565
- verbosity=verbosity,
3566
- organism=organism,
3567
3357
  sources=sources,
3568
3358
  sample_metadata_key=sample_metadata_key,
3569
3359
  )
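Both `from_mudata` and `from_spatialdata` now guard their optional dependency with `is_package_installed` rather than a try/except import. The helper itself is not part of this hunk; a minimal stand-in with the behavior these call sites rely on might look as follows (an assumption, not the actual implementation):

    from importlib.util import find_spec

    def is_package_installed(package_name: str) -> bool:
        # True if the package is importable, without actually importing it
        return find_spec(package_name) is not None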