lamindb 0.75.0__py3-none-any.whl → 0.75.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_curate.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Iterable
3
+ import copy
4
+ from typing import TYPE_CHECKING, Iterable, Type
4
5
 
5
6
  import anndata as ad
6
7
  import lamindb_setup as ln_setup
@@ -30,23 +31,25 @@ class CurateLookup:
30
31
  self,
31
32
  categoricals: dict[str, FieldAttr],
32
33
  slots: dict[str, FieldAttr] = None,
33
- using: str | None = None,
34
+ using_key: str | None = None,
34
35
  ) -> None:
35
36
  if slots is None:
36
37
  slots = {}
37
38
  self._fields = {**categoricals, **slots}
38
- self._using = None if using == "default" else using
39
- self._using_name = self._using or ln_setup.settings.instance.slug
40
- debug_message = f"Lookup objects from the " f"{colors.italic(self._using_name)}"
39
+ self._using_key = None if using_key == "default" else using_key
40
+ self._using_key_name = self._using_key or ln_setup.settings.instance.slug
41
+ debug_message = (
42
+ f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
43
+ )
41
44
  logger.debug(debug_message)
42
45
 
43
46
  def __getattr__(self, name):
44
47
  if name in self._fields:
45
48
  registry = self._fields[name].field.model
46
- if self._using == "public":
49
+ if self._using_key == "public":
47
50
  return registry.public().lookup()
48
51
  else:
49
- return get_registry_instance(registry, self._using).lookup()
52
+ return get_registry_instance(registry, self._using_key).lookup()
50
53
  raise AttributeError(
51
54
  f"'{self.__class__.__name__}' object has no attribute '{name}'"
52
55
  )
@@ -54,10 +57,10 @@ class CurateLookup:
54
57
  def __getitem__(self, name):
55
58
  if name in self._fields:
56
59
  registry = self._fields[name].field.model
57
- if self._using == "public":
60
+ if self._using_key == "public":
58
61
  return registry.public().lookup()
59
62
  else:
60
- return get_registry_instance(registry, self._using).lookup()
63
+ return get_registry_instance(registry, self._using_key).lookup()
61
64
  raise AttributeError(
62
65
  f"'{self.__class__.__name__}' object has no attribute '{name}'"
63
66
  )
@@ -71,7 +74,7 @@ class CurateLookup:
71
74
  [str([key]) for key in self._fields if not key.isidentifier()]
72
75
  )
73
76
  return (
74
- f"Lookup objects from the {colors.italic(self._using_name)}:\n "
77
+ f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
75
78
  f"{colors.green(getattr_keys)}\n "
76
79
  f"{colors.green(getitem_keys)}\n\n"
77
80
  "Example:\n → categories = validator.lookup().cell_type\n"
@@ -82,16 +85,19 @@ class CurateLookup:
82
85
 
83
86
 
84
87
  class DataFrameCurator:
85
- """Annotation flow for a DataFrame object.
88
+ """Curation flow for a DataFrame object.
89
+
90
+ See also :class:`~lamindb.Curate`.
86
91
 
87
92
  Args:
88
93
  df: The DataFrame object to curate.
89
94
  columns: The field attribute for the feature column.
90
95
  categoricals: A dictionary mapping column names to registry_field.
91
- using: The reference instance containing registries to validate against.
96
+ using_key: The reference instance containing registries to validate against.
92
97
  verbosity: The verbosity level.
93
98
  organism: The organism name.
94
99
  sources: A dictionary mapping column names to Source records.
100
+ exclude: A dictionary mapping column names to values to exclude.
95
101
 
96
102
  Examples:
97
103
  >>> import bionty as bt
@@ -106,17 +112,18 @@ class DataFrameCurator:
106
112
  df: pd.DataFrame,
107
113
  columns: FieldAttr = Feature.name,
108
114
  categoricals: dict[str, FieldAttr] | None = None,
109
- using: str | None = None,
115
+ using_key: str | None = None,
110
116
  verbosity: str = "hint",
111
117
  organism: str | None = None,
112
118
  sources: dict[str, Record] | None = None,
119
+ exclude: dict | None = None,
113
120
  ) -> None:
114
121
  from lamindb.core._settings import settings
115
122
 
116
123
  self._df = df
117
124
  self._fields = categoricals or {}
118
125
  self._columns_field = columns
119
- self._using = using
126
+ self._using_key = using_key
120
127
  settings.verbosity = verbosity
121
128
  self._artifact = None
122
129
  self._collection = None
@@ -125,25 +132,36 @@ class DataFrameCurator:
125
132
  if sources is None:
126
133
  sources = {}
127
134
  self._sources = sources
135
+ if exclude is None:
136
+ exclude = {}
137
+ self._exclude = exclude
138
+ self._non_validated = None
128
139
  self._save_columns()
129
140
 
141
+ @property
142
+ def non_validated(self) -> list:
143
+ """Return the non-validated features and labels."""
144
+ if self._non_validated is None:
145
+ raise ValueError("Please run validate() first!")
146
+ return self._non_validated
147
+
130
148
  @property
131
149
  def fields(self) -> dict:
132
150
  """Return the columns fields to validate against."""
133
151
  return self._fields
134
152
 
135
- def lookup(self, using: str | None = None) -> CurateLookup:
153
+ def lookup(self, using_key: str | None = None) -> CurateLookup:
136
154
  """Lookup categories.
137
155
 
138
156
  Args:
139
- using: The instance where the lookup is performed.
140
- if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
157
+ using_key: The instance where the lookup is performed.
158
+ if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
141
159
  if "public", the lookup is performed on the public reference.
142
160
  """
143
161
  return CurateLookup(
144
162
  categoricals=self._fields,
145
163
  slots={"columns": self._columns_field},
146
- using=using or self._using,
164
+ using_key=using_key or self._using_key,
147
165
  )
148
166
 
149
167
  def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
@@ -160,7 +178,7 @@ class DataFrameCurator:
160
178
  field=self._columns_field,
161
179
  key="columns",
162
180
  save_function="add_new_from_columns",
163
- using=self._using,
181
+ using_key=self._using_key,
164
182
  validated_only=False,
165
183
  source=self._sources.get("columns"),
166
184
  **kwargs,
@@ -174,10 +192,11 @@ class DataFrameCurator:
174
192
  field=self._columns_field,
175
193
  key="columns",
176
194
  save_function="add_new_from_columns",
177
- using=self._using,
195
+ using_key=self._using_key,
178
196
  validated_only=validated_only,
179
197
  df=self._df, # Get the Feature type from df
180
198
  source=self._sources.get("columns"),
199
+ warning=False, # Do not warn about missing columns, just an info message
181
200
  **kwargs,
182
201
  )
183
202
 
@@ -226,7 +245,7 @@ class DataFrameCurator:
226
245
  values=self._df[categorical].unique().tolist(),
227
246
  field=self.fields[categorical],
228
247
  key=categorical,
229
- using=self._using,
248
+ using_key=self._using_key,
230
249
  validated_only=validated_only,
231
250
  sources=self._sources.get(categorical),
232
251
  **kwargs,
@@ -245,11 +264,12 @@ class DataFrameCurator:
245
264
  Whether the DataFrame is validated.
246
265
  """
247
266
  self._kwargs.update({"organism": organism} if organism else {})
248
- self._validated = validate_categories_in_df(
267
+ self._validated, self._non_validated = validate_categories_in_df( # type: ignore
249
268
  self._df,
250
269
  fields=self.fields,
251
- using=self._using,
270
+ using_key=self._using_key,
252
271
  sources=self._sources,
272
+ exclude=self._exclude,
253
273
  **self._kwargs,
254
274
  )
255
275
  return self._validated
@@ -302,16 +322,21 @@ class DataFrameCurator:
302
322
 
303
323
 
304
324
  class AnnDataCurator(DataFrameCurator):
305
- """Annotation flow for ``AnnData``.
325
+ """Curation flow for ``AnnData``.
326
+
327
+ See also :class:`~lamindb.Curate`.
328
+
329
+ Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curate.from_anndata`.
306
330
 
307
331
  Args:
308
332
  data: The AnnData object or an AnnData-like path.
309
333
  var_index: The registry field for mapping the ``.var`` index.
310
334
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
311
- using: A reference LaminDB instance.
335
+ using_key: A reference LaminDB instance.
312
336
  verbosity: The verbosity level.
313
337
  organism: The organism name.
314
338
  sources: A dictionary mapping ``.obs.columns`` to Source records.
339
+ exclude: A dictionary mapping column names to values to exclude.
315
340
 
316
341
  Examples:
317
342
  >>> import bionty as bt
@@ -328,10 +353,12 @@ class AnnDataCurator(DataFrameCurator):
328
353
  data: ad.AnnData | UPathStr,
329
354
  var_index: FieldAttr,
330
355
  categoricals: dict[str, FieldAttr] | None = None,
331
- using: str = "default",
356
+ obs_columns: FieldAttr = Feature.name,
357
+ using_key: str = "default",
332
358
  verbosity: str = "hint",
333
359
  organism: str | None = None,
334
360
  sources: dict[str, Record] | None = None,
361
+ exclude: dict | None = None,
335
362
  ) -> None:
336
363
  from lamindb_setup.core import upath
337
364
 
@@ -355,13 +382,14 @@ class AnnDataCurator(DataFrameCurator):
355
382
  super().__init__(
356
383
  df=self._adata.obs,
357
384
  categoricals=categoricals,
358
- using=using,
385
+ columns=obs_columns,
386
+ using_key=using_key,
359
387
  verbosity=verbosity,
360
388
  organism=organism,
361
389
  sources=sources,
390
+ exclude=exclude,
362
391
  )
363
392
  self._obs_fields = categoricals
364
- self._save_from_var_index(validated_only=True, **self._kwargs)
365
393
 
366
394
  @property
367
395
  def var_index(self) -> FieldAttr:
@@ -373,18 +401,18 @@ class AnnDataCurator(DataFrameCurator):
373
401
  """Return the obs fields to validate against."""
374
402
  return self._obs_fields
375
403
 
376
- def lookup(self, using: str | None = None) -> CurateLookup:
404
+ def lookup(self, using_key: str | None = None) -> CurateLookup:
377
405
  """Lookup categories.
378
406
 
379
407
  Args:
380
- using: The instance where the lookup is performed.
408
+ using_key: The instance where the lookup is performed.
381
409
  if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
382
410
  if "public", the lookup is performed on the public reference.
383
411
  """
384
412
  return CurateLookup(
385
413
  categoricals=self._obs_fields,
386
414
  slots={"columns": self._columns_field, "var_index": self._var_field},
387
- using=using or self._using,
415
+ using_key=using_key or self._using_key,
388
416
  )
389
417
 
390
418
  def _save_from_var_index(
@@ -392,16 +420,25 @@ class AnnDataCurator(DataFrameCurator):
392
420
  ):
393
421
  """Save variable records."""
394
422
  update_registry(
395
- values=self._adata.var.index,
423
+ values=list(self._adata.var.index),
396
424
  field=self.var_index,
397
425
  key="var_index",
398
426
  save_function="add_new_from_var_index",
399
- using=self._using,
427
+ using_key=self._using_key,
400
428
  validated_only=validated_only,
401
429
  organism=organism,
402
430
  source=self._sources.get("var_index"),
403
431
  )
404
432
 
433
+ def _update_registry_all(self, validated_only: bool = True, **kwargs):
434
+ """Save labels for all features."""
435
+ for name in self.fields.keys():
436
+ logger.info(f"saving labels for '{name}'")
437
+ if name == "var_index":
438
+ self._save_from_var_index(validated_only=validated_only, **kwargs)
439
+ else:
440
+ self._update_registry(name, validated_only=validated_only, **kwargs)
441
+
405
442
  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
406
443
  """Update variable records.
407
444
 
@@ -412,6 +449,15 @@ class AnnDataCurator(DataFrameCurator):
412
449
  self._kwargs.update({"organism": organism} if organism else {})
413
450
  self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
414
451
 
452
+ def add_validated_from_var_index(self, organism: str | None = None):
453
+ """Add validated variable records.
454
+
455
+ Args:
456
+ organism: The organism name.
457
+ """
458
+ self._kwargs.update({"organism": organism} if organism else {})
459
+ self._save_from_var_index(validated_only=True, **self._kwargs)
460
+
415
461
  def validate(self, organism: str | None = None) -> bool:
416
462
  """Validate categories.
417
463
 
@@ -422,24 +468,32 @@ class AnnDataCurator(DataFrameCurator):
422
468
  Whether the AnnData object is validated.
423
469
  """
424
470
  self._kwargs.update({"organism": organism} if organism else {})
425
- if self._using is not None and self._using != "default":
471
+ if self._using_key is not None and self._using_key != "default":
426
472
  logger.important(
427
- f"validating metadata using registries of instance {colors.italic(self._using)}"
473
+ f"validating metadata using registries of instance {colors.italic(self._using_key)}"
428
474
  )
429
- validated_var = validate_categories(
475
+
476
+ validated_var, non_validated_var = validate_categories(
430
477
  self._adata.var.index,
431
478
  field=self._var_field,
432
479
  key="var_index",
433
- using=self._using,
434
- **self._kwargs,
480
+ using_key=self._using_key,
481
+ source=self._sources.get("var_index"),
482
+ validated_hint_print=".add_validated_from_var_index()",
483
+ exclude=self._exclude.get("var_index"),
484
+ **self._kwargs, # type: ignore
435
485
  )
436
- validated_obs = validate_categories_in_df(
486
+ validated_obs, non_validated_obs = validate_categories_in_df(
437
487
  self._adata.obs,
438
488
  fields=self.categoricals,
439
- using=self._using,
489
+ using_key=self._using_key,
440
490
  sources=self._sources,
491
+ exclude=self._exclude,
441
492
  **self._kwargs,
442
493
  )
494
+ self._non_validated = non_validated_obs # type: ignore
495
+ if len(non_validated_var) > 0:
496
+ self._non_validated["var_index"] = non_validated_var # type: ignore
443
497
  self._validated = validated_var and validated_obs
444
498
  return self._validated
445
499
 
@@ -471,7 +525,12 @@ class AnnDataCurator(DataFrameCurator):
471
525
 
472
526
 
473
527
  class MuDataCurator:
474
- """Annotation flow for a ``MuData`` object.
528
+ """Curation flow for a ``MuData`` object.
529
+
530
+ See also :class:`~lamindb.Curate`.
531
+
532
+ Note that if genes or other measurements are removed from the MuData object,
533
+ the object should be recreated using :meth:`~lamindb.Curate.from_mudata`.
475
534
 
476
535
  Args:
477
536
  mdata: The MuData object to curate.
@@ -480,9 +539,11 @@ class MuDataCurator:
480
539
  ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
481
540
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
482
541
  Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
483
- using: A reference LaminDB instance.
542
+ using_key: A reference LaminDB instance.
484
543
  verbosity: The verbosity level.
485
544
  organism: The organism name.
545
+ sources: A dictionary mapping ``.obs.columns`` to Source records.
546
+ exclude: A dictionary mapping column names to values to exclude.
486
547
 
487
548
  Examples:
488
549
  >>> import bionty as bt
@@ -499,29 +560,34 @@ class MuDataCurator:
499
560
  mdata: MuData,
500
561
  var_index: dict[str, dict[str, FieldAttr]],
501
562
  categoricals: dict[str, FieldAttr] | None = None,
502
- using: str = "default",
563
+ using_key: str = "default",
503
564
  verbosity: str = "hint",
504
565
  organism: str | None = None,
505
566
  sources: dict[str, Record] | None = None,
567
+ exclude: dict | None = None,
506
568
  ) -> None:
507
569
  if sources is None:
508
570
  sources = {}
509
571
  self._sources = sources
572
+ if exclude is None:
573
+ exclude = {}
574
+ self._exclude = exclude
510
575
  self._mdata = mdata
511
576
  self._kwargs = {"organism": organism} if organism else {}
512
577
  self._var_fields = var_index
513
578
  self._verify_modality(self._var_fields.keys())
514
579
  self._obs_fields = self._parse_categoricals(categoricals)
515
580
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
516
- self._using = using
581
+ self._using_key = using_key
517
582
  self._verbosity = verbosity
518
583
  self._df_annotators = {
519
584
  modality: DataFrameCurator(
520
585
  df=mdata[modality].obs if modality != "obs" else mdata.obs,
521
586
  categoricals=self._obs_fields.get(modality, {}),
522
- using=using,
587
+ using_key=using_key,
523
588
  verbosity=verbosity,
524
589
  sources=self._sources.get(modality),
590
+ exclude=self._exclude.get(modality),
525
591
  **self._kwargs,
526
592
  )
527
593
  for modality in self._modalities
@@ -552,11 +618,11 @@ class MuDataCurator:
552
618
  ):
553
619
  """Save variable records."""
554
620
  update_registry(
555
- values=self._mdata[modality].var.index,
621
+ values=list(self._mdata[modality].var.index),
556
622
  field=self._var_fields[modality],
557
623
  key="var_index",
558
624
  save_function="add_new_from_var_index",
559
- using=self._using,
625
+ using_key=self._using_key,
560
626
  validated_only=validated_only,
561
627
  dtype="number",
562
628
  **kwargs,
@@ -580,12 +646,12 @@ class MuDataCurator:
580
646
  obs_fields["obs"][k] = v
581
647
  return obs_fields
582
648
 
583
- def lookup(self, using: str | None = None) -> CurateLookup:
649
+ def lookup(self, using_key: str | None = None) -> CurateLookup:
584
650
  """Lookup categories.
585
651
 
586
652
  Args:
587
- using: The instance where the lookup is performed.
588
- if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
653
+ using_key: The instance where the lookup is performed.
654
+ if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
589
655
  if "public", the lookup is performed on the public reference.
590
656
  """
591
657
  return CurateLookup(
@@ -594,7 +660,7 @@ class MuDataCurator:
594
660
  **self._obs_fields,
595
661
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
596
662
  },
597
- using=using or self._using,
663
+ using_key=using_key or self._using_key,
598
664
  )
599
665
 
600
666
  def add_new_from_columns(
@@ -613,14 +679,15 @@ class MuDataCurator:
613
679
  **kwargs: Additional keyword arguments to pass to the registry model.
614
680
  """
615
681
  self._kwargs.update({"organism": organism} if organism else {})
682
+ values = column_names or self._mdata[modality].obs.columns
616
683
  update_registry(
617
- values=column_names or self._mdata[modality].obs.columns,
684
+ values=list(values),
618
685
  field=Feature.name,
619
686
  key=f"{modality} obs columns",
620
- using=self._using,
687
+ using_key=self._using_key,
621
688
  validated_only=False,
622
689
  df=self._mdata[modality].obs,
623
- **self._kwargs,
690
+ **self._kwargs, # type: ignore
624
691
  **kwargs,
625
692
  )
626
693
 
@@ -639,6 +706,18 @@ class MuDataCurator:
639
706
  modality=modality, validated_only=False, **self._kwargs, **kwargs
640
707
  )
641
708
 
709
+ def add_validated_from_var_index(self, modality: str, organism: str | None = None):
710
+ """Add validated variable records.
711
+
712
+ Args:
713
+ modality: The modality name.
714
+ organism: The organism name.
715
+ """
716
+ self._kwargs.update({"organism": organism} if organism else {})
717
+ self._save_from_var_index_modality(
718
+ modality=modality, validated_only=True, **self._kwargs
719
+ )
720
+
642
721
  def add_validated_from(
643
722
  self, key: str, modality: str | None = None, organism: str | None = None
644
723
  ):
@@ -681,32 +760,48 @@ class MuDataCurator:
681
760
  def validate(self, organism: str | None = None) -> bool:
682
761
  """Validate categories."""
683
762
  self._kwargs.update({"organism": organism} if organism else {})
684
- if self._using is not None and self._using != "default":
763
+ if self._using_key is not None and self._using_key != "default":
685
764
  logger.important(
686
- f"validating metadata using registries of instance {colors.italic(self._using)}"
765
+ f"validating metadata using registries of instance {colors.italic(self._using_key)}"
687
766
  )
688
767
  validated_var = True
768
+ non_validated_var_modality = {}
689
769
  for modality, var_field in self._var_fields.items():
690
- validated_var &= validate_categories(
770
+ is_validated_var, non_validated_var = validate_categories(
691
771
  self._mdata[modality].var.index,
692
772
  field=var_field,
693
773
  key=f"{modality}_var_index",
694
- using=self._using,
695
- **self._kwargs,
774
+ using_key=self._using_key,
775
+ exclude=self._exclude.get(f"{modality}_var_index"),
776
+ **self._kwargs, # type: ignore
696
777
  )
778
+ validated_var &= is_validated_var
779
+ if len(non_validated_var) > 0:
780
+ non_validated_var_modality[modality] = non_validated_var
781
+
697
782
  validated_obs = True
783
+ non_validated_obs_modality = {}
698
784
  for modality, fields in self._obs_fields.items():
699
785
  if modality == "obs":
700
786
  obs = self._mdata.obs
701
787
  else:
702
788
  obs = self._mdata[modality].obs
703
- validated_obs &= validate_categories_in_df(
789
+ is_validated_obs, non_validated_obs = validate_categories_in_df(
704
790
  obs,
705
791
  fields=fields,
706
- using=self._using,
792
+ using_key=self._using_key,
707
793
  sources=self._sources.get(modality),
794
+ exclude=self._exclude.get(modality),
708
795
  **self._kwargs,
709
796
  )
797
+ validated_obs &= is_validated_obs
798
+ non_validated_obs_modality[modality] = non_validated_obs
799
+ if modality in non_validated_var_modality:
800
+ non_validated_obs_modality[modality]["var_index"] = (
801
+ non_validated_var_modality[modality]
802
+ )
803
+ if len(non_validated_obs_modality[modality]) > 0:
804
+ self._non_validated = non_validated_obs_modality[modality]
710
805
  self._validated = validated_var and validated_obs
711
806
  return self._validated
712
807
 
@@ -735,7 +830,32 @@ class MuDataCurator:
735
830
 
736
831
 
737
832
  class Curate:
738
- """Annotation flow."""
833
+ """Curation flow.
834
+
835
+ Data curation entails accurately labeling datasets with standardized metadata
836
+ to facilitate data integration, interpretation and analysis.
837
+
838
+ The curation flow has several steps:
839
+
840
+ 1. Create a :class:`Curate` object corresponding to the object type that you want to curate:
841
+
842
+ - :meth:`~lamindb.Curate.from_df`
843
+ - :meth:`~lamindb.Curate.from_anndata`
844
+ - :meth:`~lamindb.Curate.from_mudata`
845
+
846
+ During object creation, any passed categoricals found in the object will be saved.
847
+
848
+ 2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:
849
+
850
+ - Values that can be successfully validated and already exist in the registry.
851
+ - Values that are new and not yet validated, or that are potentially problematic.
852
+
853
+ 3. Determine how to handle validated and unvalidated values:
854
+
855
+ - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
856
+ - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
857
+ - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
858
+ """
739
859
 
740
860
  @classmethod
741
861
  @doc_args(DataFrameCurator.__doc__)
@@ -744,7 +864,7 @@ class Curate:
744
864
  df: pd.DataFrame,
745
865
  categoricals: dict[str, FieldAttr] | None = None,
746
866
  columns: FieldAttr = Feature.name,
747
- using: str | None = None,
867
+ using_key: str | None = None,
748
868
  verbosity: str = "hint",
749
869
  organism: str | None = None,
750
870
  ) -> DataFrameCurator:
@@ -753,7 +873,7 @@ class Curate:
753
873
  df=df,
754
874
  categoricals=categoricals,
755
875
  columns=columns,
756
- using=using,
876
+ using_key=using_key,
757
877
  verbosity=verbosity,
758
878
  organism=organism,
759
879
  )
@@ -765,7 +885,8 @@ class Curate:
765
885
  data: ad.AnnData | UPathStr,
766
886
  var_index: FieldAttr,
767
887
  categoricals: dict[str, FieldAttr] | None = None,
768
- using: str = "default",
888
+ obs_columns: FieldAttr = Feature.name,
889
+ using_key: str = "default",
769
890
  verbosity: str = "hint",
770
891
  organism: str | None = None,
771
892
  sources: dict[str, Record] | None = None,
@@ -775,7 +896,8 @@ class Curate:
775
896
  data=data,
776
897
  var_index=var_index,
777
898
  categoricals=categoricals,
778
- using=using,
899
+ obs_columns=obs_columns,
900
+ using_key=using_key,
779
901
  verbosity=verbosity,
780
902
  organism=organism,
781
903
  sources=sources,
@@ -788,7 +910,7 @@ class Curate:
788
910
  mdata: MuData,
789
911
  var_index: dict[str, dict[str, FieldAttr]],
790
912
  categoricals: dict[str, FieldAttr] | None = None,
791
- using: str = "default",
913
+ using_key: str = "default",
792
914
  verbosity: str = "hint",
793
915
  organism: str | None = None,
794
916
  ) -> MuDataCurator:
@@ -797,29 +919,68 @@ class Curate:
797
919
  mdata=mdata,
798
920
  var_index=var_index,
799
921
  categoricals=categoricals,
800
- using=using,
922
+ using_key=using_key,
801
923
  verbosity=verbosity,
802
924
  organism=organism,
803
925
  )
804
926
 
805
927
 
806
- def get_registry_instance(registry: Record, using: str | None = None) -> Record:
928
+ def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
807
929
  """Get a registry instance using a specific instance."""
808
- if using is not None and using != "default":
809
- return registry.using(using)
930
+ if using_key is not None and using_key != "default":
931
+ return registry.using(using_key)
810
932
  return registry
811
933
 
812
934
 
935
+ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
936
+ """Make sure the source and organism are saved in the same database as the registry."""
937
+ from lamindb.core._settings import settings
938
+
939
+ db = registry.filter().db
940
+ source = kwargs.get("source")
941
+ organism = kwargs.get("organism")
942
+ filter_kwargs = kwargs.copy()
943
+ try:
944
+ verbosity = settings.verbosity
945
+ settings.verbosity = "error"
946
+ if isinstance(organism, Record) and organism._state.db != "default":
947
+ if db is None or db == "default":
948
+ organism_default = copy.copy(organism)
949
+ # save the organism record in the default database
950
+ organism_default.save()
951
+ filter_kwargs["organism"] = organism_default
952
+ if isinstance(source, Record) and source._state.db != "default":
953
+ if db is None or db == "default":
954
+ source_default = copy.copy(source)
955
+ # save the source record in the default database
956
+ source_default.save()
957
+ filter_kwargs["source"] = source_default
958
+ finally:
959
+ settings.verbosity = verbosity
960
+ return filter_kwargs
961
+
962
+
813
963
  def standardize_and_inspect(
814
- values: Iterable[str], field: FieldAttr, registry: Record, **kwargs
964
+ values: Iterable[str],
965
+ field: FieldAttr,
966
+ registry: type[Record],
967
+ standardize: bool = False,
968
+ **kwargs,
815
969
  ):
816
970
  """Standardize and inspect values using a registry."""
817
- if hasattr(registry, "standardize") and hasattr(
818
- registry,
819
- "synonyms", # https://github.com/laminlabs/lamindb/issues/1685
820
- ):
821
- values = registry.standardize(values, field=field, mute=True, **kwargs)
822
- return registry.inspect(values, field=field, mute=True, **kwargs)
971
+ filter_kwargs = get_current_filter_kwargs(registry, kwargs)
972
+
973
+ if standardize:
974
+ if hasattr(registry, "standardize") and hasattr(
975
+ registry,
976
+ "synonyms", # https://github.com/laminlabs/lamindb/issues/1685
977
+ ):
978
+ standardized_values = registry.standardize(
979
+ values, field=field, mute=True, **filter_kwargs
980
+ )
981
+ values = standardized_values
982
+
983
+ return registry.inspect(values, field=field, mute=True, **filter_kwargs)
823
984
 
824
985
 
825
986
  def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
@@ -840,11 +1001,26 @@ def validate_categories(
840
1001
  values: Iterable[str],
841
1002
  field: FieldAttr,
842
1003
  key: str,
843
- using: str | None = None,
1004
+ using_key: str | None = None,
844
1005
  organism: str | None = None,
845
1006
  source: Record | None = None,
846
- ) -> bool:
847
- """Validate ontology terms in a pandas series using LaminDB registries."""
1007
+ exclude: str | list | None = None,
1008
+ standardize: bool = True,
1009
+ validated_hint_print: str | None = None,
1010
+ ) -> tuple[bool, list]:
1011
+ """Validate ontology terms in a pandas series using LaminDB registries.
1012
+
1013
+ Args:
1014
+ values: The values to validate.
1015
+ field: The field attribute.
1016
+ key: The key referencing the slot in the DataFrame.
1017
+ using_key: A reference LaminDB instance.
1018
+ organism: The organism name.
1019
+ source: The source record.
1020
+ exclude: Exclude specific values.
1021
+ standardize: Standardize the values.
1022
+ validated_hint_print: The hint to print for validated values.
1023
+ """
848
1024
  from lamindb._from_values import _print_values
849
1025
  from lamindb.core._settings import settings
850
1026
 
@@ -856,43 +1032,60 @@ def validate_categories(
856
1032
  logger.indent = " "
857
1033
 
858
1034
  registry = field.field.model
859
- filter_kwargs = check_registry_organism(registry, organism)
860
- filter_kwargs.update({"source": source} if source else {})
1035
+ kwargs = check_registry_organism(registry, organism)
1036
+ kwargs.update({"source": source} if source else {})
1037
+
1038
+ # inspect the default instance
1039
+ if exclude is not None:
1040
+ exclude = [exclude] if isinstance(exclude, str) else exclude
1041
+ # exclude values are validated without source and organism
1042
+ inspect_result = registry.inspect(exclude, field=field, mute=True)
1043
+ # if exclude values are validated, remove them from the values
1044
+ values = [i for i in values if i not in inspect_result.validated]
861
1045
 
862
- # Inspect the default instance
863
1046
  inspect_result = standardize_and_inspect(
864
- values=values, field=field, registry=registry, **filter_kwargs
1047
+ values=values,
1048
+ field=field,
1049
+ registry=registry,
1050
+ standardize=standardize,
1051
+ **kwargs,
865
1052
  )
866
1053
  non_validated = inspect_result.non_validated
867
1054
 
868
1055
  values_validated = []
869
- if using is not None and using != "default" and non_validated:
870
- registry = get_registry_instance(registry, using)
871
- # Inspect the using instance
1056
+ if using_key is not None and using_key != "default" and non_validated:
1057
+ registry_using = get_registry_instance(registry, using_key)
1058
+ # inspect the using instance
872
1059
  inspect_result = standardize_and_inspect(
873
- values=non_validated, field=field, registry=registry, **filter_kwargs
1060
+ values=non_validated,
1061
+ field=field,
1062
+ registry=registry_using,
1063
+ standardize=standardize,
1064
+ **kwargs,
874
1065
  )
875
1066
  non_validated = inspect_result.non_validated
876
1067
  values_validated += inspect_result.validated
877
1068
 
878
- # Inspect from public (bionty only)
1069
+ # inspect from public (bionty only)
879
1070
  if hasattr(registry, "public"):
880
1071
  verbosity = settings.verbosity
881
1072
  try:
882
1073
  settings.verbosity = "error"
883
1074
  public_records = registry.from_values(
884
- non_validated, field=field, **filter_kwargs
1075
+ non_validated,
1076
+ field=field,
1077
+ **get_current_filter_kwargs(registry, kwargs),
885
1078
  )
886
1079
  values_validated += [getattr(r, field.field.name) for r in public_records]
887
1080
  finally:
888
1081
  settings.verbosity = verbosity
889
1082
 
890
- validated_hint_print = f".add_validated_from('{key}')"
1083
+ validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
891
1084
  n_validated = len(values_validated)
892
1085
  if n_validated > 0:
893
1086
  _log_mapping_info()
894
1087
  logger.warning(
895
- f"found {colors.yellow(f'{n_validated} terms')} validated terms: "
1088
+ f"found {colors.yellow(n_validated)} validated terms: "
896
1089
  f"{colors.yellow(values_validated)}\n → save terms via "
897
1090
  f"{colors.yellow(validated_hint_print)}"
898
1091
  )
@@ -903,43 +1096,49 @@ def validate_categories(
903
1096
  if n_non_validated == 0:
904
1097
  logger.indent = ""
905
1098
  logger.success(f"{key} is validated against {colors.italic(model_field)}")
906
- return True
1099
+ return True, []
907
1100
  else:
908
1101
  are = "are" if n_non_validated > 1 else "is"
909
1102
  print_values = _print_values(non_validated)
910
1103
  warning_message = (
911
- f"{colors.yellow(f'{n_non_validated} terms')} {are} not validated: "
912
- f"{colors.yellow(print_values)}\n → save terms via "
913
- f"{colors.yellow(non_validated_hint_print)}"
1104
+ f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
1105
+ f"{colors.red(print_values)}\n → save terms via "
1106
+ f"{colors.red(non_validated_hint_print)}"
914
1107
  )
915
1108
  if logger.indent == "":
916
1109
  _log_mapping_info()
917
1110
  logger.warning(warning_message)
918
1111
  logger.indent = ""
919
- return False
1112
+ return False, non_validated
920
1113
 
921
1114
 
922
1115
  def validate_categories_in_df(
923
1116
  df: pd.DataFrame,
924
1117
  fields: dict[str, FieldAttr],
925
- using: str | None = None,
1118
+ using_key: str | None = None,
926
1119
  sources: dict[str, Record] = None,
1120
+ exclude: dict | None = None,
927
1121
  **kwargs,
928
- ) -> bool:
1122
+ ) -> tuple[bool, dict]:
929
1123
  """Validate categories in DataFrame columns using LaminDB registries."""
930
1124
  if sources is None:
931
1125
  sources = {}
932
1126
  validated = True
1127
+ non_validated = {}
933
1128
  for key, field in fields.items():
934
- validated &= validate_categories(
1129
+ is_val, non_val = validate_categories(
935
1130
  df[key],
936
1131
  field=field,
937
1132
  key=key,
938
- using=using,
1133
+ using_key=using_key,
939
1134
  source=sources.get(key),
1135
+ exclude=exclude.get(key) if exclude else None,
940
1136
  **kwargs,
941
1137
  )
942
- return validated
1138
+ validated &= is_val
1139
+ if len(non_val) > 0:
1140
+ non_validated[key] = non_val
1141
+ return validated, non_validated
943
1142
 
944
1143
 
945
1144
  def save_artifact(
@@ -1017,8 +1216,13 @@ def save_artifact(
1017
1216
  feature = features.get(key)
1018
1217
  registry = field.field.model
1019
1218
  filter_kwargs = check_registry_organism(registry, organism)
1219
+ filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
1020
1220
  df = data if isinstance(data, pd.DataFrame) else data.obs
1021
- labels = registry.from_values(df[key], field=field, **filter_kwargs)
1221
+ labels = registry.from_values(
1222
+ df[key],
1223
+ field=field,
1224
+ **filter_kwargs_current,
1225
+ )
1022
1226
  artifact.labels.add(labels, feature)
1023
1227
 
1024
1228
  if artifact._accessor == "MuData":
@@ -1041,22 +1245,24 @@ def update_registry(
1041
1245
  field: FieldAttr,
1042
1246
  key: str,
1043
1247
  save_function: str = "add_new_from",
1044
- using: str | None = None,
1248
+ using_key: str | None = None,
1045
1249
  validated_only: bool = True,
1046
1250
  df: pd.DataFrame | None = None,
1047
1251
  organism: str | None = None,
1048
1252
  dtype: str | None = None,
1049
1253
  source: Record | None = None,
1254
+ standardize: bool = True,
1255
+ warning: bool = True,
1050
1256
  **kwargs,
1051
- ) -> list[Record]:
1052
- """Save features or labels records in the default instance from the using instance.
1257
+ ) -> None:
1258
+ """Save features or labels records in the default instance from the using_key instance.
1053
1259
 
1054
1260
  Args:
1055
1261
  values: A list of values to be saved as labels.
1056
1262
  field: The FieldAttr object representing the field for which labels are being saved.
1057
1263
  key: The name of the feature to save.
1058
1264
  save_function: The name of the function to save the labels.
1059
- using: The name of the instance from which to transfer labels (if applicable).
1265
+ using_key: The name of the instance from which to transfer labels (if applicable).
1060
1266
  validated_only: If True, only save validated labels.
1061
1267
  df: A DataFrame to save labels from.
1062
1268
  organism: The organism name.
@@ -1074,51 +1280,74 @@ def update_registry(
1074
1280
  verbosity = settings.verbosity
1075
1281
  try:
1076
1282
  settings.verbosity = "error"
1283
+
1284
+ # save from public
1285
+ filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
1286
+ existing_and_public_records = (
1287
+ registry.from_values(
1288
+ list(values),
1289
+ field=field,
1290
+ **filter_kwargs_current,
1291
+ )
1292
+ if values
1293
+ else []
1294
+ )
1295
+
1296
+ labels_saved: dict = {"from public": [], "without reference": []}
1297
+
1298
+ public_records = [r for r in existing_and_public_records if r._state.adding]
1299
+ # here we check to only save the public records if they are from the specified source
1300
+ # we check the uid because r.source and source can be from different instances
1301
+ if source:
1302
+ public_records = [r for r in public_records if r.source.uid == source.uid]
1303
+ ln_save(public_records)
1304
+ labels_saved["from public"] = [
1305
+ getattr(r, field.field.name) for r in public_records
1306
+ ]
1307
+ non_public_labels = [i for i in values if i not in labels_saved["from public"]]
1308
+
1309
+ # inspect the default instance
1077
1310
  inspect_result_current = standardize_and_inspect(
1078
- values=values, field=field, registry=registry, **filter_kwargs
1311
+ values=non_public_labels,
1312
+ field=field,
1313
+ registry=registry,
1314
+ standardize=standardize,
1315
+ **filter_kwargs,
1079
1316
  )
1080
1317
  if not inspect_result_current.non_validated:
1081
1318
  all_labels = registry.from_values(
1082
- inspect_result_current.validated, field=field, **filter_kwargs
1319
+ inspect_result_current.validated,
1320
+ field=field,
1321
+ **filter_kwargs_current,
1083
1322
  )
1084
1323
  settings.verbosity = verbosity
1085
1324
  return all_labels
1086
1325
 
1087
- labels_saved: dict = {"from public": [], "without reference": []}
1088
-
1326
+ # inspect the using_key instance
1089
1327
  (
1090
- labels_saved[f"from {using}"],
1328
+ labels_saved[f"from {using_key}"],
1091
1329
  non_validated_labels,
1092
1330
  ) = update_registry_from_using_instance(
1093
1331
  inspect_result_current.non_validated,
1094
1332
  field=field,
1095
- using=using,
1333
+ using_key=using_key,
1096
1334
  **filter_kwargs,
1097
1335
  )
1098
1336
 
1099
- public_records = (
1100
- registry.from_values(non_validated_labels, field=field, **filter_kwargs)
1101
- if non_validated_labels
1102
- else []
1103
- )
1104
- # here we check to only save the public records if they are from the specified source
1105
- # TODO: this if shouldn't be needed
1106
- if source:
1107
- public_records = [r for r in public_records if r.source == source]
1108
- ln_save(public_records)
1109
- labels_saved["from public"] = [
1110
- getattr(r, field.field.name) for r in public_records
1111
- ]
1112
1337
  labels_saved["without reference"] = [
1113
- i for i in non_validated_labels if i not in labels_saved["from public"]
1338
+ i
1339
+ for i in non_validated_labels
1340
+ if i not in labels_saved[f"from {using_key}"]
1114
1341
  ]
1115
1342
 
1343
+ # save non-validated records
1116
1344
  if not validated_only:
1117
1345
  non_validated_records = []
1118
1346
  if df is not None and registry == Feature:
1119
1347
  non_validated_records = Feature.from_df(df)
1120
1348
  else:
1121
1349
  if "organism" in filter_kwargs:
1350
+ # make sure organism record is saved to the current instance
1122
1351
  filter_kwargs["organism"] = _save_organism(name=organism)
1123
1352
  init_kwargs = {}
1124
1353
  for value in labels_saved["without reference"]:
@@ -1134,15 +1363,16 @@ def update_registry(
1134
1363
  )
1135
1364
  ln_save(non_validated_records)
1136
1365
 
1366
+ # save parent labels for ulabels
1137
1367
  if registry == ULabel and field.field.name == "name":
1138
1368
  save_ulabels_with_parent(values, field=field, key=key)
1139
1369
 
1140
- # get all records
1141
- all_labels = registry.from_values(
1142
- inspect_result_current.validated + inspect_result_current.non_validated,
1143
- field=field,
1144
- **filter_kwargs,
1145
- )
1370
+ # # get all records that are now validated in the current instance
1371
+ # all_labels = registry.from_values(
1372
+ # inspect_result_current.validated + inspect_result_current.non_validated,
1373
+ # field=field,
1374
+ # **get_current_filter_kwargs(registry, filter_kwargs),
1375
+ # )
1146
1376
  finally:
1147
1377
  settings.verbosity = verbosity
1148
1378
 
@@ -1152,9 +1382,10 @@ def update_registry(
1152
1382
  save_function=save_function,
1153
1383
  model_field=f"{registry.__name__}.{field.field.name}",
1154
1384
  validated_only=validated_only,
1385
+ warning=warning,
1155
1386
  )
1156
1387
 
1157
- return all_labels
1388
+ # return all_labels
1158
1389
 
1159
1390
 
1160
1391
  def log_saved_labels(
@@ -1163,6 +1394,7 @@ def log_saved_labels(
1163
1394
  save_function: str,
1164
1395
  model_field: str,
1165
1396
  validated_only: bool = True,
1397
+ warning: bool = True,
1166
1398
  ) -> None:
1167
1399
  """Log the saved labels."""
1168
1400
  from ._from_values import _print_values
@@ -1187,7 +1419,10 @@ def log_saved_labels(
1187
1419
  if save_function == "add_new_from"
1188
1420
  else f"\n → to save, run {colors.yellow(save_function)}"
1189
1421
  )
1190
- logger.warning(msg)
1422
+ if warning:
1423
+ logger.warning(msg)
1424
+ else:
1425
+ logger.info(msg)
1191
1426
  else:
1192
1427
  k = "" if k == "without reference" else f"{colors.green(k)} "
1193
1428
  # the term "transferred" stresses that this is always in the context of transferring
@@ -1202,7 +1437,7 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
1202
1437
  """Save a parent label for the given labels."""
1203
1438
  registry = field.field.model
1204
1439
  assert registry == ULabel # noqa: S101
1205
- all_records = registry.from_values(values, field=field)
1440
+ all_records = registry.from_values(list(values), field=field)
1206
1441
  is_feature = registry.filter(name=f"is_{key}").one_or_none()
1207
1442
  if is_feature is None:
1208
1443
  is_feature = registry(name=f"is_{key}")
@@ -1213,15 +1448,16 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
1213
1448
  def update_registry_from_using_instance(
1214
1449
  values: list[str],
1215
1450
  field: FieldAttr,
1216
- using: str | None = None,
1451
+ using_key: str | None = None,
1452
+ standardize: bool = False,
1217
1453
  **kwargs,
1218
1454
  ) -> tuple[list[str], list[str]]:
1219
- """Save features or labels records from the using instance.
1455
+ """Save features or labels records from the using_key instance.
1220
1456
 
1221
1457
  Args:
1222
1458
  values: A list of values to be saved as labels.
1223
1459
  field: The FieldAttr object representing the field for which labels are being saved.
1224
- using: The name of the instance from which to transfer labels (if applicable).
1460
+ using_key: The name of the instance from which to transfer labels (if applicable).
1225
1461
  kwargs: Additional keyword arguments to pass to the registry model.
1226
1462
 
1227
1463
  Returns:
@@ -1230,11 +1466,15 @@ def update_registry_from_using_instance(
1230
1466
  labels_saved = []
1231
1467
  not_saved = values
1232
1468
 
1233
- if using is not None and using != "default":
1234
- registry = field.field.model
1235
- registry_using = get_registry_instance(registry, using)
1469
+ if using_key is not None and using_key != "default":
1470
+ registry_using = get_registry_instance(field.field.model, using_key)
1471
+
1236
1472
  inspect_result_using = standardize_and_inspect(
1237
- values=values, field=field, registry=registry_using, **kwargs
1473
+ values=values,
1474
+ field=field,
1475
+ registry=registry_using,
1476
+ standardize=standardize,
1477
+ **kwargs,
1238
1478
  )
1239
1479
  labels_using = registry_using.filter(
1240
1480
  **{f"{field.field.name}__in": inspect_result_using.validated}