lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_curate.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import copy
4
+ import warnings
4
5
  from typing import TYPE_CHECKING
5
6
 
6
7
  import anndata as ad
@@ -16,6 +17,7 @@ from lnschema_core import (
16
17
  ULabel,
17
18
  )
18
19
 
20
+ from ._from_values import _print_values
19
21
  from .core.exceptions import ValidationError
20
22
 
21
23
  if TYPE_CHECKING:
@@ -28,7 +30,21 @@ if TYPE_CHECKING:
28
30
 
29
31
 
30
32
  class CurateLookup:
31
- """Lookup categories from the reference instance."""
33
+ """Lookup categories from the reference instance.
34
+
35
+ Args:
36
+ categoricals: A dictionary of categorical fields to lookup.
37
+ slots: A dictionary of slot fields to lookup.
38
+ using_key: The key of the instance to lookup from. Defaults to the
39
+ current instance if not specified.
40
+ public: Whether to lookup from the public instance. Defaults to False.
41
+
42
+ Example:
43
+ >>> validator = ln.Validator()
44
+ >>> validator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
45
+ <Category: alveolar_type_1_fibroblast_cell>
46
+
47
+ """
32
48
 
33
49
  def __init__(
34
50
  self,
@@ -37,8 +53,7 @@ class CurateLookup:
37
53
  using_key: str | None = None,
38
54
  public: bool = False,
39
55
  ) -> None:
40
- if slots is None:
41
- slots = {}
56
+ slots = slots or {}
42
57
  self._fields = {**categoricals, **slots}
43
58
  self._using_key = None if using_key == "default" else using_key
44
59
  self._using_key_name = self._using_key or ln_setup.settings.instance.slug
@@ -54,7 +69,7 @@ class CurateLookup:
54
69
  else:
55
70
  return get_registry_instance(registry, self._using_key).lookup()
56
71
  raise AttributeError(
57
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
72
+ f'"{self.__class__.__name__}" object has no attribute "{name}"'
58
73
  )
59
74
 
60
75
  def __getitem__(self, name):
@@ -65,7 +80,7 @@ class CurateLookup:
65
80
  else:
66
81
  return get_registry_instance(registry, self._using_key).lookup()
67
82
  raise AttributeError(
68
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
83
+ f'"{self.__class__.__name__}" object has no attribute "{name}"'
69
84
  )
70
85
 
71
86
  def __repr__(self) -> str:
@@ -81,7 +96,7 @@ class CurateLookup:
81
96
  f"Lookup objects from the {colors.italic(ref)}:\n "
82
97
  f"{colors.green(getattr_keys)}\n "
83
98
  f"{colors.green(getitem_keys)}\n"
84
- "Example:\n → categories = validator.lookup()['cell_type']\n"
99
+ 'Example:\n → categories = validator.lookup()["cell_type"]\n'
85
100
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
86
101
  "To look up public ontologies, use .lookup(public=True)"
87
102
  )
@@ -95,10 +110,25 @@ class BaseCurator:
95
110
  def validate(self) -> bool:
96
111
  """Validate dataset.
97
112
 
113
+ This method also registers the validated records in the current instance.
114
+
98
115
  Returns:
99
116
  Boolean indicating whether the dataset is validated.
100
117
  """
101
- pass
118
+ pass # pragma: no cover
119
+
120
+ def standardize(self, key: str) -> None:
121
+ """Replace synonyms with standardized values.
122
+
123
+ Inplace modification of the dataset.
124
+
125
+ Args:
126
+ key: The name of the column to standardize.
127
+
128
+ Returns:
129
+ None
130
+ """
131
+ pass # pragma: no cover
102
132
 
103
133
  def save_artifact(
104
134
  self,
@@ -110,15 +140,15 @@ class BaseCurator:
110
140
  """Save the dataset as artifact.
111
141
 
112
142
  Args:
113
- description: `str | None = None` A description of the DataFrame object.
114
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
115
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
116
- run: `Run | None = None` The run that creates the artifact.
143
+ description: A description of the DataFrame object.
144
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
145
+ revises: Previous version of the artifact. Triggers a revision.
146
+ run: The run that creates the artifact.
117
147
 
118
148
  Returns:
119
149
  A saved artifact record.
120
150
  """
121
- pass
151
+ pass # pragma: no cover
122
152
 
123
153
 
124
154
  class DataFrameCurator(BaseCurator):
@@ -136,6 +166,9 @@ class DataFrameCurator(BaseCurator):
136
166
  sources: A dictionary mapping column names to Source records.
137
167
  exclude: A dictionary mapping column names to values to exclude.
138
168
 
169
+ Returns:
170
+ A curator object.
171
+
139
172
  Examples:
140
173
  >>> import bionty as bt
141
174
  >>> curate = ln.Curator.from_df(
@@ -165,24 +198,21 @@ class DataFrameCurator(BaseCurator):
165
198
  self._fields = categoricals or {}
166
199
  self._columns_field = columns
167
200
  self._using_key = using_key
201
+ # TODO: change verbosity back
168
202
  settings.verbosity = verbosity
169
203
  self._artifact = None
170
204
  self._collection = None
171
205
  self._validated = False
172
206
  self._kwargs = {"organism": organism} if organism else {}
173
- if sources is None:
174
- sources = {}
175
- self._sources = sources
176
- if exclude is None:
177
- exclude = {}
178
- self._exclude = exclude
207
+ self._sources = sources or {}
208
+ self._exclude = exclude or {}
179
209
  self._non_validated = None
180
210
  if check_valid_keys:
181
211
  self._check_valid_keys()
182
212
  self._save_columns()
183
213
 
184
214
  @property
185
- def non_validated(self) -> list:
215
+ def non_validated(self) -> dict[str, list[str]]:
186
216
  """Return the non-validated features and labels."""
187
217
  if self._non_validated is None:
188
218
  raise ValidationError("Please run validate() first!")
@@ -200,7 +230,6 @@ class DataFrameCurator(BaseCurator):
200
230
 
201
231
  Args:
202
232
  using_key: The instance where the lookup is performed.
203
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
204
233
  if "public", the lookup is performed on the public reference.
205
234
  """
206
235
  return CurateLookup(
@@ -210,9 +239,8 @@ class DataFrameCurator(BaseCurator):
210
239
  public=public,
211
240
  )
212
241
 
213
- def _check_valid_keys(self, extra: set = None) -> None:
214
- if extra is None:
215
- extra = set()
242
+ def _check_valid_keys(self, extra: set | None = None) -> None:
243
+ extra = extra or set()
216
244
  for name, d in {
217
245
  "categoricals": self._fields,
218
246
  "sources": self._sources,
@@ -222,9 +250,12 @@ class DataFrameCurator(BaseCurator):
222
250
  raise TypeError(f"{name} must be a dictionary!")
223
251
  valid_keys = set(self._df.columns) | {"columns"} | extra
224
252
  nonval_keys = [key for key in d.keys() if key not in valid_keys]
253
+ n = len(nonval_keys)
254
+ s = "s" if n > 1 else ""
255
+ are = "are" if n > 1 else "is"
225
256
  if len(nonval_keys) > 0:
226
257
  raise ValidationError(
227
- f"the following keys passed to {name} are not allowed: {nonval_keys}"
258
+ f"the following {n} key{s} passed to {name} {are} not allowed: {colors.yellow(_print_values(nonval_keys))}"
228
259
  )
229
260
 
230
261
  def _save_columns(self, validated_only: bool = True) -> None:
@@ -234,7 +265,6 @@ class DataFrameCurator(BaseCurator):
234
265
  values=list(self.fields.keys()),
235
266
  field=self._columns_field,
236
267
  key="columns",
237
- save_function="add_new_from_columns",
238
268
  using_key=self._using_key,
239
269
  validated_only=False,
240
270
  source=self._sources.get("columns"),
@@ -249,13 +279,11 @@ class DataFrameCurator(BaseCurator):
249
279
  values=list(additional_columns),
250
280
  field=self._columns_field,
251
281
  key="columns",
252
- save_function="add_new_from_columns",
253
282
  using_key=self._using_key,
254
283
  validated_only=validated_only,
255
284
  df=self._df, # Get the Feature type from df
256
285
  source=self._sources.get("columns"),
257
286
  exclude=self._exclude.get("columns"),
258
- warning=False, # Do not warn about missing columns, just an info message
259
287
  **self._kwargs, # type: ignore
260
288
  )
261
289
 
@@ -265,7 +293,7 @@ class DataFrameCurator(BaseCurator):
265
293
  Args:
266
294
  key: The key referencing the slot in the DataFrame from which to draw terms.
267
295
  organism: The organism name.
268
- **kwargs: Additional keyword arguments to pass to the registry model.
296
+ **kwargs: Additional keyword arguments to pass to create new records
269
297
  """
270
298
  if len(kwargs) > 0 and key == "all":
271
299
  raise ValueError("Cannot pass additional arguments to 'all' key!")
@@ -273,20 +301,83 @@ class DataFrameCurator(BaseCurator):
273
301
  self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
274
302
 
275
303
  def add_new_from_columns(self, organism: str | None = None, **kwargs):
276
- """Add validated & new column names to its registry.
304
+ """Deprecated to run by default during init."""
305
+ warnings.warn(
306
+ "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
307
+ DeprecationWarning,
308
+ stacklevel=2,
309
+ )
310
+ pass
311
+
312
+ def _replace_synonyms(
313
+ self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
314
+ ):
315
+ # replace the values in df
316
+ std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
317
+ # remove the standardized values from self.non_validated
318
+ non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
319
+ if len(non_validated) == 0:
320
+ self._non_validated.pop(key, None) # type: ignore
321
+ else:
322
+ self._non_validated[key] = non_validated # type: ignore
323
+ # logging
324
+ n = len(syn_mapper)
325
+ if n > 0:
326
+ syn_mapper_print = _print_values(
327
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
328
+ )
329
+ s = "s" if n > 1 else ""
330
+ logger.success(
331
+ f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
332
+ )
333
+ return std_values
334
+
335
+ def standardize(self, key: str):
336
+ """Replace synonyms with standardized values.
277
337
 
278
338
  Args:
279
- organism: The organism name.
280
- **kwargs: Additional keyword arguments to pass to the registry model.
339
+ key: The key referencing the slot in the DataFrame from which to draw terms.
340
+
341
+ Modifies the input dataset inplace.
281
342
  """
282
- self._kwargs.update({"organism": organism} if organism else {})
283
- self._save_columns(validated_only=False, **self._kwargs, **kwargs)
343
+ # list is needed to avoid RuntimeError: dictionary changed size during iteration
344
+ avail_keys = list(self.non_validated.keys())
345
+ if len(avail_keys) == 0:
346
+ logger.warning("values are already standardized")
347
+ return
348
+
349
+ if key == "all":
350
+ for k in avail_keys:
351
+ if k in self._fields: # needed to exclude var_index
352
+ syn_mapper = standardize_categories(
353
+ self.non_validated[k],
354
+ field=self._fields[k],
355
+ using_key=self._using_key,
356
+ source=self._sources.get(k),
357
+ **self._kwargs,
358
+ )
359
+ self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
360
+ else:
361
+ if key not in avail_keys:
362
+ raise KeyError(
363
+ f'"{key}" is not a valid key, available keys are: {_print_values(avail_keys)}!'
364
+ )
365
+ else:
366
+ if key in self._fields: # needed to exclude var_index
367
+ syn_mapper = standardize_categories(
368
+ self.non_validated[key],
369
+ field=self._fields[key],
370
+ using_key=self._using_key,
371
+ source=self._sources.get(key),
372
+ **self._kwargs,
373
+ )
374
+ self._df[key] = self._replace_synonyms(
375
+ key, syn_mapper, self._df[key]
376
+ )
284
377
 
285
378
  def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
286
379
  if categorical == "all":
287
380
  self._update_registry_all(validated_only=validated_only, **kwargs)
288
- elif categorical == "columns":
289
- self._save_columns(validated_only=validated_only, **kwargs)
290
381
  else:
291
382
  if categorical not in self.fields:
292
383
  raise ValidationError(
@@ -302,6 +393,9 @@ class DataFrameCurator(BaseCurator):
302
393
  exclude=self._exclude.get(categorical),
303
394
  **kwargs,
304
395
  )
396
+ # adding new records removes them from non_validated
397
+ if not validated_only and self._non_validated:
398
+ self._non_validated.pop(categorical, None) # type: ignore
305
399
 
306
400
  def _update_registry_all(self, validated_only: bool = True, **kwargs):
307
401
  """Save labels for all features."""
@@ -311,6 +405,10 @@ class DataFrameCurator(BaseCurator):
311
405
  def validate(self, organism: str | None = None) -> bool:
312
406
  """Validate variables and categorical observations.
313
407
 
408
+ This method also registers the validated records in the current instance:
409
+ - from public sources
410
+ - from the using_key instance
411
+
314
412
  Args:
315
413
  organism: The organism name.
316
414
 
@@ -342,10 +440,10 @@ class DataFrameCurator(BaseCurator):
342
440
  """Save the validated DataFrame and metadata.
343
441
 
344
442
  Args:
345
- description: `str | None = None` Description of the DataFrame object.
346
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
347
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
348
- run: `Run | None = None` The run that creates the artifact.
443
+ description: Description of the DataFrame object.
444
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
445
+ revises: Previous version of the artifact. Triggers a revision.
446
+ run: The run that creates the artifact.
349
447
 
350
448
  Returns:
351
449
  A saved artifact record.
@@ -361,10 +459,6 @@ class DataFrameCurator(BaseCurator):
361
459
  verbosity = settings.verbosity
362
460
  try:
363
461
  settings.verbosity = "warning"
364
- if not self._validated:
365
- # save all validated records to the current instance
366
- self._update_registry_all()
367
-
368
462
  self._artifact = save_artifact(
369
463
  self._df,
370
464
  description=description,
@@ -403,6 +497,7 @@ class AnnDataCurator(DataFrameCurator):
403
497
  data: The AnnData object or an AnnData-like path.
404
498
  var_index: The registry field for mapping the ``.var`` index.
405
499
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
500
+ obs_columns: The registry field for mapping the ``.obs.columns``.
406
501
  using_key: A reference LaminDB instance.
407
502
  verbosity: The verbosity level.
408
503
  organism: The organism name.
@@ -428,7 +523,7 @@ class AnnDataCurator(DataFrameCurator):
428
523
  var_index: FieldAttr,
429
524
  categoricals: dict[str, FieldAttr] | None = None,
430
525
  obs_columns: FieldAttr = Feature.name,
431
- using_key: str = "default",
526
+ using_key: str | None = None,
432
527
  verbosity: str = "hint",
433
528
  organism: str | None = None,
434
529
  sources: dict[str, Record] | None = None,
@@ -456,7 +551,7 @@ class AnnDataCurator(DataFrameCurator):
456
551
 
457
552
  if "symbol" in str(var_index):
458
553
  logger.warning(
459
- "Curating gene symbols is discouraged. See FAQ for more details."
554
+ "indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
460
555
  )
461
556
 
462
557
  self._data = data
@@ -492,7 +587,6 @@ class AnnDataCurator(DataFrameCurator):
492
587
 
493
588
  Args:
494
589
  using_key: The instance where the lookup is performed.
495
- if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
496
590
  if "public", the lookup is performed on the public reference.
497
591
  """
498
592
  return CurateLookup(
@@ -510,7 +604,6 @@ class AnnDataCurator(DataFrameCurator):
510
604
  values=list(self._adata.var.index),
511
605
  field=self.var_index,
512
606
  key="var_index",
513
- save_function=".add_new_from_var_index()",
514
607
  using_key=self._using_key,
515
608
  validated_only=validated_only,
516
609
  organism=organism,
@@ -529,7 +622,7 @@ class AnnDataCurator(DataFrameCurator):
529
622
 
530
623
  Args:
531
624
  organism: The organism name.
532
- **kwargs: Additional keyword arguments to pass to the registry model.
625
+ **kwargs: Additional keyword arguments to pass to create new records.
533
626
  """
534
627
  self._kwargs.update({"organism": organism} if organism else {})
535
628
  self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
@@ -537,6 +630,8 @@ class AnnDataCurator(DataFrameCurator):
537
630
  def validate(self, organism: str | None = None) -> bool:
538
631
  """Validate categories.
539
632
 
633
+ This method also registers the validated records in the current instance.
634
+
540
635
  Args:
541
636
  organism: The organism name.
542
637
 
@@ -558,7 +653,7 @@ class AnnDataCurator(DataFrameCurator):
558
653
  key="var_index",
559
654
  using_key=self._using_key,
560
655
  source=self._sources.get("var_index"),
561
- validated_hint_print=".add_validated_from_var_index()",
656
+ hint_print=".add_new_from_var_index()",
562
657
  exclude=self._exclude.get("var_index"),
563
658
  **self._kwargs, # type: ignore
564
659
  )
@@ -576,6 +671,34 @@ class AnnDataCurator(DataFrameCurator):
576
671
  self._validated = validated_var and validated_obs
577
672
  return self._validated
578
673
 
674
+ def standardize(self, key: str):
675
+ """Replace synonyms with standardized values.
676
+
677
+ Args:
678
+ key: The key referencing the slot in `adata.obs` from which to draw terms. Same as the key in `categoricals`.
679
+
680
+ - If "var_index", standardize the var.index.
681
+ - If "all", standardize all obs columns and var.index.
682
+
683
+ Inplace modification of the dataset.
684
+ """
685
+ if key in self._adata.obs.columns or key == "all":
686
+ # standardize obs columns
687
+ super().standardize(key)
688
+ # in addition to the obs columns, standardize the var.index
689
+ if key == "var_index" or key == "all":
690
+ syn_mapper = standardize_categories(
691
+ self._adata.var.index,
692
+ field=self.var_index,
693
+ using_key=self._using_key,
694
+ source=self._sources.get("var_index"),
695
+ **self._kwargs,
696
+ )
697
+ if "var_index" in self._non_validated: # type: ignore
698
+ self._adata.var.index = self._replace_synonyms(
699
+ "var_index", syn_mapper, self._adata.var.index
700
+ )
701
+
579
702
  def save_artifact(
580
703
  self,
581
704
  description: str | None = None,
@@ -586,10 +709,10 @@ class AnnDataCurator(DataFrameCurator):
586
709
  """Save the validated ``AnnData`` and metadata.
587
710
 
588
711
  Args:
589
- description: `str | None = None` A description of the ``AnnData`` object.
590
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
591
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
592
- run: `Run | None = None` The run that creates the artifact.
712
+ description: A description of the ``AnnData`` object.
713
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
714
+ revises: Previous version of the artifact. Triggers a revision.
715
+ run: The run that creates the artifact.
593
716
 
594
717
  Returns:
595
718
  A saved artifact record.
@@ -603,9 +726,6 @@ class AnnDataCurator(DataFrameCurator):
603
726
  verbosity = settings.verbosity
604
727
  try:
605
728
  settings.verbosity = "warning"
606
- if not self._validated:
607
- # save all validated records to the current instance
608
- self._update_registry_all()
609
729
  self._artifact = save_artifact(
610
730
  self._data,
611
731
  adata=self._adata,
@@ -662,13 +782,13 @@ class MuDataCurator:
662
782
  def __init__(
663
783
  self,
664
784
  mdata: MuData,
665
- var_index: dict[str, dict[str, FieldAttr]],
785
+ var_index: dict[str, FieldAttr],
666
786
  categoricals: dict[str, FieldAttr] | None = None,
667
- using_key: str = "default",
787
+ using_key: str | None = None,
668
788
  verbosity: str = "hint",
669
789
  organism: str | None = None,
670
790
  sources: dict[str, Record] | None = None,
671
- exclude: dict | None = None,
791
+ exclude: dict | None = None, # {modality: {field: [values]}}
672
792
  ) -> None:
673
793
  if sources is None:
674
794
  sources = {}
@@ -684,19 +804,34 @@ class MuDataCurator:
684
804
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
685
805
  self._using_key = using_key
686
806
  self._verbosity = verbosity
687
- self._df_annotators = {
688
- modality: DataFrameCurator(
689
- df=mdata[modality].obs if modality != "obs" else mdata.obs,
690
- categoricals=self._obs_fields.get(modality, {}),
807
+ self._obs_df_curator = None
808
+ if "obs" in self._modalities:
809
+ self._obs_df_curator = DataFrameCurator(
810
+ df=mdata.obs,
811
+ columns=Feature.name,
812
+ categoricals=self._obs_fields.get("obs", {}),
813
+ using_key=using_key,
814
+ verbosity=verbosity,
815
+ sources=self._sources.get("obs"),
816
+ exclude=self._exclude.get("obs"),
817
+ check_valid_keys=False,
818
+ **self._kwargs,
819
+ )
820
+ self._mod_adata_curators = {
821
+ modality: AnnDataCurator(
822
+ data=mdata[modality],
823
+ var_index=var_index.get(modality),
824
+ categoricals=self._obs_fields.get(modality),
691
825
  using_key=using_key,
692
826
  verbosity=verbosity,
693
827
  sources=self._sources.get(modality),
694
828
  exclude=self._exclude.get(modality),
695
- check_valid_keys=False,
696
829
  **self._kwargs,
697
830
  )
698
831
  for modality in self._modalities
832
+ if modality != "obs"
699
833
  }
834
+ self._non_validated = None
700
835
 
701
836
  @property
702
837
  def var_index(self) -> FieldAttr:
@@ -708,29 +843,19 @@ class MuDataCurator:
708
843
  """Return the obs fields to validate against."""
709
844
  return self._obs_fields
710
845
 
846
+ @property
847
+ def non_validated(self) -> dict[str, dict[str, list[str]]]:
848
+ """Return the non-validated features and labels."""
849
+ if self._non_validated is None:
850
+ raise ValidationError("Please run validate() first!")
851
+ return self._non_validated
852
+
711
853
  def _verify_modality(self, modalities: Iterable[str]):
712
854
  """Verify the modality exists."""
713
855
  for modality in modalities:
714
856
  if modality not in self._mdata.mod.keys():
715
857
  raise ValidationError(f"modality '{modality}' does not exist!")
716
858
 
717
- def _save_from_var_index_modality(
718
- self, modality: str, validated_only: bool = True, **kwargs
719
- ):
720
- """Save variable records."""
721
- update_registry(
722
- values=list(self._mdata[modality].var.index),
723
- field=self._var_fields[modality],
724
- key="var_index",
725
- save_function=f'.add_new_from_var_index("{modality}")',
726
- using_key=self._using_key,
727
- validated_only=validated_only,
728
- dtype="number",
729
- source=self._sources.get(modality, {}).get("var_index"),
730
- exclude=self._exclude.get(modality, {}).get("var_index"),
731
- **kwargs,
732
- )
733
-
734
859
  def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
735
860
  """Parse the categorical fields."""
736
861
  prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
@@ -756,13 +881,18 @@ class MuDataCurator:
756
881
 
757
882
  Args:
758
883
  using_key: The instance where the lookup is performed.
759
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
760
884
  if "public", the lookup is performed on the public reference.
761
885
  """
886
+ obs_fields = {}
887
+ for mod, fields in self._obs_fields.items():
888
+ for k, v in fields.items():
889
+ if k == "obs":
890
+ obs_fields[k] = v
891
+ else:
892
+ obs_fields[f"{mod}:{k}"] = v
762
893
  return CurateLookup(
763
- categoricals=self._obs_fields,
894
+ categoricals=obs_fields,
764
895
  slots={
765
- **self._obs_fields,
766
896
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
767
897
  },
768
898
  using_key=using_key or self._using_key,
@@ -776,27 +906,11 @@ class MuDataCurator:
776
906
  organism: str | None = None,
777
907
  **kwargs,
778
908
  ):
779
- """Update columns records.
780
-
781
- Args:
782
- modality: The modality name.
783
- column_names: The column names to save.
784
- organism: The organism name.
785
- **kwargs: Additional keyword arguments to pass to the registry model.
786
- """
787
- self._kwargs.update({"organism": organism} if organism else {})
788
- values = column_names or self._mdata[modality].obs.columns
789
- update_registry(
790
- values=list(values),
791
- field=Feature.name,
792
- key=f"{modality} obs columns",
793
- using_key=self._using_key,
794
- validated_only=False,
795
- df=self._mdata[modality].obs,
796
- source=self._sources.get(modality, {}).get("columns"),
797
- exclude=self._exclude.get(modality, {}).get("columns"),
798
- **self._kwargs, # type: ignore
799
- **kwargs,
909
+ """Update columns records."""
910
+ warnings.warn(
911
+ "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
912
+ DeprecationWarning,
913
+ stacklevel=2,
800
914
  )
801
915
 
802
916
  def add_new_from_var_index(
@@ -807,21 +921,21 @@ class MuDataCurator:
807
921
  Args:
808
922
  modality: The modality name.
809
923
  organism: The organism name.
810
- **kwargs: Additional keyword arguments to pass to the registry model.
924
+ **kwargs: Additional keyword arguments to pass to create new records.
811
925
  """
812
926
  self._kwargs.update({"organism": organism} if organism else {})
813
- self._save_from_var_index_modality(
814
- modality=modality, validated_only=False, **self._kwargs, **kwargs
927
+ self._mod_adata_curators[modality].add_new_from_var_index(
928
+ **self._kwargs, **kwargs
815
929
  )
816
930
 
817
931
  def _update_registry_all(self):
818
932
  """Update all registries."""
819
- for modality in self._var_fields.keys():
820
- self._save_from_var_index_modality(
821
- modality=modality, validated_only=True, **self._kwargs
933
+ if self._obs_df_curator is not None:
934
+ self._obs_df_curator._update_registry_all(
935
+ validated_only=True, **self._kwargs
822
936
  )
823
- for _, df_annotator in self._df_annotators.items():
824
- df_annotator._update_registry_all(validated_only=True, **self._kwargs)
937
+ for _, adata_curator in self._mod_adata_curators.items():
938
+ adata_curator._update_registry_all(validated_only=True, **self._kwargs)
825
939
 
826
940
  def add_new_from(
827
941
  self,
@@ -836,15 +950,17 @@ class MuDataCurator:
836
950
  key: The key referencing the slot in the DataFrame.
837
951
  modality: The modality name.
838
952
  organism: The organism name.
839
- **kwargs: Additional keyword arguments to pass to the registry model.
953
+ **kwargs: Additional keyword arguments to pass to create new records.
840
954
  """
841
955
  if len(kwargs) > 0 and key == "all":
842
956
  raise ValueError("Cannot pass additional arguments to 'all' key!")
843
957
  self._kwargs.update({"organism": organism} if organism else {})
844
958
  modality = modality or "obs"
845
- if modality in self._df_annotators:
846
- df_annotator = self._df_annotators[modality]
847
- df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)
959
+ if modality in self._mod_adata_curators:
960
+ adata_curator = self._mod_adata_curators[modality]
961
+ adata_curator.add_new_from(key=key, **self._kwargs, **kwargs)
962
+ if modality == "obs":
963
+ self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs)
848
964
 
849
965
  def validate(self, organism: str | None = None) -> bool:
850
966
  """Validate categories."""
@@ -853,7 +969,7 @@ class MuDataCurator:
853
969
  self._kwargs.update({"organism": organism} if organism else {})
854
970
  if self._using_key is not None and self._using_key != "default":
855
971
  logger.important(
856
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
972
+ f"validating using registries of instance {colors.italic(self._using_key)}"
857
973
  )
858
974
 
859
975
  # add all validated records to the current instance
@@ -864,49 +980,42 @@ class MuDataCurator:
864
980
  finally:
865
981
  settings.verbosity = verbosity
866
982
 
867
- validated_var = True
868
- non_validated_var_modality = {}
869
- for modality, var_field in self._var_fields.items():
870
- is_validated_var, non_validated_var = validate_categories(
871
- self._mdata[modality].var.index,
872
- field=var_field,
873
- key=f"{modality}_var_index",
874
- using_key=self._using_key,
875
- source=self._sources.get(modality, {}).get("var_index"),
876
- exclude=self._exclude.get(modality, {}).get("var_index"),
877
- validated_hint_print=f'.add_validated_from_var_index("{modality}")',
878
- **self._kwargs, # type: ignore
879
- )
880
- validated_var &= is_validated_var
881
- if len(non_validated_var) > 0:
882
- non_validated_var_modality[modality] = non_validated_var
983
+ self._non_validated = {} # type: ignore
883
984
 
884
- validated_obs = True
885
- non_validated_obs_modality = {}
886
- for modality, fields in self._obs_fields.items():
887
- if modality == "obs":
888
- obs = self._mdata.obs
889
- else:
890
- obs = self._mdata[modality].obs
891
- is_validated_obs, non_validated_obs = validate_categories_in_df(
892
- obs,
893
- fields=fields,
894
- using_key=self._using_key,
895
- sources=self._sources.get(modality),
896
- exclude=self._exclude.get(modality),
897
- **self._kwargs,
898
- )
899
- validated_obs &= is_validated_obs
900
- non_validated_obs_modality[modality] = non_validated_obs
901
- if modality in non_validated_var_modality:
902
- non_validated_obs_modality[modality]["var_index"] = (
903
- non_validated_var_modality[modality]
904
- )
905
- if len(non_validated_obs_modality[modality]) > 0:
906
- self._non_validated = non_validated_obs_modality[modality]
907
- self._validated = validated_var and validated_obs
985
+ obs_validated = True
986
+ if "obs" in self._modalities:
987
+ logger.info('validating categoricals in "obs"...')
988
+ obs_validated &= self._obs_df_curator.validate(**self._kwargs)
989
+ self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
990
+ logger.print("")
991
+
992
+ mods_validated = True
993
+ for modality, adata_curator in self._mod_adata_curators.items():
994
+ logger.info(f'validating categoricals in modality "{modality}"...')
995
+ mods_validated &= adata_curator.validate(**self._kwargs)
996
+ if len(adata_curator.non_validated) > 0:
997
+ self._non_validated[modality] = adata_curator.non_validated # type: ignore
998
+ logger.print("")
999
+
1000
+ self._validated = obs_validated & mods_validated
908
1001
  return self._validated
909
1002
 
1003
+ def standardize(self, key: str, modality: str | None = None):
1004
+ """Replace synonyms with standardized values.
1005
+
1006
+ Args:
1007
+ key: The key referencing the slot in the `MuData`.
1008
+ modality: The modality name.
1009
+
1010
+ Inplace modification of the dataset.
1011
+ """
1012
+ modality = modality or "obs"
1013
+ if modality in self._mod_adata_curators:
1014
+ adata_curator = self._mod_adata_curators[modality]
1015
+ adata_curator.standardize(key=key)
1016
+ if modality == "obs":
1017
+ self._obs_df_curator.standardize(key=key)
1018
+
910
1019
  def save_artifact(
911
1020
  self,
912
1021
  description: str | None = None,
@@ -917,10 +1026,10 @@ class MuDataCurator:
917
1026
  """Save the validated ``MuData`` and metadata.
918
1027
 
919
1028
  Args:
920
- description: `str | None = None` A description of the ``MuData`` object.
921
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
922
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
923
- run: `Run | None = None` The run that creates the artifact.
1029
+ description: A description of the ``MuData`` object.
1030
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1031
+ revises: Previous version of the artifact. Triggers a revision.
1032
+ run: The run that creates the artifact.
924
1033
 
925
1034
  Returns:
926
1035
  A saved artifact record.
@@ -934,10 +1043,6 @@ class MuDataCurator:
934
1043
  verbosity = settings.verbosity
935
1044
  try:
936
1045
  settings.verbosity = "warning"
937
- if not self._validated:
938
- # save all validated records to the current instance
939
- self._update_registry_all()
940
-
941
1046
  self._artifact = save_artifact(
942
1047
  self._mdata,
943
1048
  description=description,
@@ -1007,7 +1112,7 @@ class Curator(BaseCurator):
1007
1112
  var_index: FieldAttr,
1008
1113
  categoricals: dict[str, FieldAttr] | None = None,
1009
1114
  obs_columns: FieldAttr = Feature.name,
1010
- using_key: str = "default",
1115
+ using_key: str | None = None,
1011
1116
  verbosity: str = "hint",
1012
1117
  organism: str | None = None,
1013
1118
  sources: dict[str, Record] | None = None,
@@ -1031,7 +1136,7 @@ class Curator(BaseCurator):
1031
1136
  mdata: MuData,
1032
1137
  var_index: dict[str, dict[str, FieldAttr]],
1033
1138
  categoricals: dict[str, FieldAttr] | None = None,
1034
- using_key: str = "default",
1139
+ using_key: str | None = None,
1035
1140
  verbosity: str = "hint",
1036
1141
  organism: str | None = None,
1037
1142
  ) -> MuDataCurator:
@@ -1081,15 +1186,14 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
1081
1186
  return filter_kwargs
1082
1187
 
1083
1188
 
1084
- def standardize_and_inspect(
1189
+ def inspect_instance(
1085
1190
  values: Iterable[str],
1086
1191
  field: FieldAttr,
1087
1192
  registry: type[Record],
1088
- standardize: bool = False,
1089
1193
  exclude: str | list | None = None,
1090
1194
  **kwargs,
1091
1195
  ):
1092
- """Standardize and inspect values using a registry."""
1196
+ """Inspect values using a registry."""
1093
1197
  # inspect exclude values in the default instance
1094
1198
  values = list(values)
1095
1199
  include_validated = []
@@ -1103,16 +1207,6 @@ def standardize_and_inspect(
1103
1207
  values = [i for i in values if i not in inspect_result_exclude.validated]
1104
1208
  include_validated = inspect_result_exclude.validated
1105
1209
 
1106
- if standardize:
1107
- if hasattr(registry, "standardize") and hasattr(
1108
- registry,
1109
- "synonyms", # https://github.com/laminlabs/lamindb/issues/1685
1110
- ):
1111
- standardized_values = registry.standardize(
1112
- values, field=field, mute=True, **kwargs
1113
- )
1114
- values = standardized_values
1115
-
1116
1210
  inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
1117
1211
  inspect_result._validated += include_validated
1118
1212
  inspect_result._non_validated = [
@@ -1144,8 +1238,7 @@ def validate_categories(
1144
1238
  organism: str | None = None,
1145
1239
  source: Record | None = None,
1146
1240
  exclude: str | list | None = None,
1147
- standardize: bool = True,
1148
- validated_hint_print: str | None = None,
1241
+ hint_print: str | None = None,
1149
1242
  ) -> tuple[bool, list]:
1150
1243
  """Validate ontology terms in a pandas series using LaminDB registries.
1151
1244
 
@@ -1158,7 +1251,7 @@ def validate_categories(
1158
1251
  source: The source record.
1159
1252
  exclude: Exclude specific values from validation.
1160
1253
  standardize: Whether to standardize the values.
1161
- validated_hint_print: The hint to print for validated values.
1254
+ hint_print: The hint to print that suggests fixing non-validated values.
1162
1255
  """
1163
1256
  from lamindb._from_values import _print_values
1164
1257
  from lamindb.core._settings import settings
@@ -1167,42 +1260,43 @@ def validate_categories(
1167
1260
 
1168
1261
  def _log_mapping_info():
1169
1262
  logger.indent = ""
1170
- logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
1171
- logger.indent = " "
1263
+ logger.info(f'mapping "{key}" on {colors.italic(model_field)}')
1264
+ logger.indent = " "
1172
1265
 
1173
1266
  registry = field.field.model
1174
1267
 
1268
+ # {"organism": organism_name/organism_record}
1175
1269
  kwargs = check_registry_organism(registry, organism)
1176
1270
  kwargs.update({"source": source} if source else {})
1177
1271
  kwargs_current = get_current_filter_kwargs(registry, kwargs)
1178
1272
 
1179
- # inspect the default instance
1180
- inspect_result = standardize_and_inspect(
1273
+ # inspect values from the default instance
1274
+ inspect_result = inspect_instance(
1181
1275
  values=values,
1182
1276
  field=field,
1183
1277
  registry=registry,
1184
- standardize=standardize,
1185
1278
  exclude=exclude,
1186
1279
  **kwargs_current,
1187
1280
  )
1188
1281
  non_validated = inspect_result.non_validated
1282
+ syn_mapper = inspect_result.synonyms_mapper
1189
1283
 
1190
- # inspect the using instance
1284
+ # inspect the non-validated values from the using_key instance
1191
1285
  values_validated = []
1192
1286
  if using_key is not None and using_key != "default" and non_validated:
1193
1287
  registry_using = get_registry_instance(registry, using_key)
1194
- inspect_result = standardize_and_inspect(
1288
+ inspect_result = inspect_instance(
1195
1289
  values=non_validated,
1196
1290
  field=field,
1197
1291
  registry=registry_using,
1198
- standardize=standardize,
1199
1292
  exclude=exclude,
1200
1293
  **kwargs,
1201
1294
  )
1202
1295
  non_validated = inspect_result.non_validated
1203
1296
  values_validated += inspect_result.validated
1297
+ syn_mapper.update(inspect_result.synonyms_mapper)
1204
1298
 
1205
- # inspect from public (bionty only)
1299
+ # inspect the non-validated values from public (bionty only)
1206
1300
  if hasattr(registry, "public"):
1207
1301
  verbosity = settings.verbosity
1208
1302
  try:
@@ -1216,39 +1310,35 @@ def validate_categories(
1216
1310
  finally:
1217
1311
  settings.verbosity = verbosity
1218
1312
 
1219
- validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
1220
- n_validated = len(values_validated)
1221
-
1222
- if n_validated > 0:
1223
- _log_mapping_info()
1224
- terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}"
1225
- val_numerous = "" if n_validated == 1 else "s"
1226
- logger.warning(
1227
- f"found {colors.yellow(n_validated)} validated term{val_numerous}: "
1228
- f"{colors.yellow(terms_str)}\n"
1229
- f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}"
1230
- )
1231
-
1232
- non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_")
1313
+ # logging messages
1314
+ non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
1233
1315
  non_validated = [i for i in non_validated if i not in values_validated]
1234
1316
  n_non_validated = len(non_validated)
1235
1317
  if n_non_validated == 0:
1236
- if n_validated == 0:
1318
+ if len(values_validated) == 0:
1319
+ # nothing to validate
1237
1320
  logger.indent = ""
1238
- logger.success(f"'{key}' is validated against {colors.italic(model_field)}")
1321
+ logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
1239
1322
  return True, []
1240
1323
  else:
1241
1324
  # validated values still need to be saved to the current instance
1242
1325
  return False, []
1243
1326
  else:
1244
- non_val_numerous = ("", "is") if n_non_validated == 1 else ("s", "are")
1327
+ are = "is" if n_non_validated == 1 else "are"
1328
+ s = "" if n_non_validated == 1 else "s"
1245
1329
  print_values = _print_values(non_validated)
1246
- warning_message = (
1247
- f"{colors.red(f'{n_non_validated} term{non_val_numerous[0]}')} {non_val_numerous[1]} not validated: "
1248
- f"{colors.red(', '.join(print_values.split(', ')[:10]) + ', ...' if len(print_values.split(', ')) > 10 else print_values)}\n"
1249
- f"→ fix typo{non_val_numerous[0]}, remove non-existent value{non_val_numerous[0]}, or save term{non_val_numerous[0]} via "
1250
- f"{colors.red(non_validated_hint_print)}"
1251
- )
1330
+ warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
1331
+ if syn_mapper:
1332
+ s = "" if len(syn_mapper) == 1 else "s"
1333
+ syn_mapper_print = _print_values(
1334
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
1335
+ )
1336
+ hint_msg = f'.standardize("{key}")'
1337
+ warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
1338
+ if n_non_validated > len(syn_mapper):
1339
+ if syn_mapper:
1340
+ warning_message += " for remaining terms:\n"
1341
+ warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
1252
1342
 
1253
1343
  if logger.indent == "":
1254
1344
  _log_mapping_info()
@@ -1257,6 +1347,44 @@ def validate_categories(
1257
1347
  return False, non_validated
1258
1348
 
1259
1349
 
1350
+ def standardize_categories(
1351
+ values: Iterable[str],
1352
+ field: FieldAttr,
1353
+ using_key: str | None = None,
1354
+ organism: str | None = None,
1355
+ source: Record | None = None,
1356
+ ) -> dict:
1357
+ """Get a synonym mapper."""
1358
+ registry = field.field.model
1359
+ if not hasattr(registry, "standardize"):
1360
+ return {}
1361
+ # standardize values using the default instance
1362
+ syn_mapper = registry.standardize(
1363
+ values,
1364
+ field=field.field.name,
1365
+ organism=organism,
1366
+ source=source,
1367
+ mute=True,
1368
+ return_mapper=True,
1369
+ )
1370
+
1371
+ if len(values) > len(syn_mapper): # type: ignore
1372
+ # standardize values using the using_key instance
1373
+ if using_key is not None and using_key != "default":
1374
+ registry_using = get_registry_instance(registry, using_key)
1375
+ syn_mapper.update(
1376
+ registry_using.standardize(
1377
+ [v for v in values if v not in syn_mapper],
1378
+ field=field.field.name,
1379
+ organism=organism,
1380
+ source=source,
1381
+ mute=True,
1382
+ return_mapper=True,
1383
+ )
1384
+ )
1385
+ return syn_mapper
1386
+
1387
+
1260
1388
  def validate_categories_in_df(
1261
1389
  df: pd.DataFrame,
1262
1390
  fields: dict[str, FieldAttr],
@@ -1304,15 +1432,15 @@ def save_artifact(
1304
1432
 
1305
1433
  Args:
1306
1434
  data: The DataFrame or AnnData object to save.
1307
- description: A description of the artifact.
1308
1435
  fields: A dictionary mapping obs_column to registry_field.
1309
1436
  columns_field: The registry field to validate variables index against.
1437
+ description: A description of the artifact.
1310
1438
  organism: The organism name.
1311
1439
  adata: The AnnData object to save and get n_observations, must be provided if data is a path.
1312
- type: `Literal["dataset", "model"] | None = None` The artifact type.
1313
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1314
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
1315
- run: `Run | None = None` The run that creates the artifact.
1440
+ type: The artifact type.
1441
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1442
+ revises: Previous version of the artifact. Triggers a revision.
1443
+ run: The run that creates the artifact.
1316
1444
 
1317
1445
  Returns:
1318
1446
  The saved Artifact.
@@ -1402,6 +1530,7 @@ def save_artifact(
1402
1530
  feature=feature,
1403
1531
  feature_ref_is_name=feature_ref_is_name,
1404
1532
  label_ref_is_name=label_ref_is_name,
1533
+ from_curator=True,
1405
1534
  )
1406
1535
 
1407
1536
  if artifact._accessor == "MuData":
@@ -1457,15 +1586,12 @@ def update_registry(
1457
1586
  values: list[str],
1458
1587
  field: FieldAttr,
1459
1588
  key: str,
1460
- save_function: str = "add_new_from",
1461
1589
  using_key: str | None = None,
1462
1590
  validated_only: bool = True,
1463
1591
  df: pd.DataFrame | None = None,
1464
1592
  organism: str | None = None,
1465
1593
  dtype: str | None = None,
1466
1594
  source: Record | None = None,
1467
- standardize: bool = True,
1468
- warning: bool = True,
1469
1595
  exclude: str | list | None = None,
1470
1596
  **kwargs,
1471
1597
  ) -> None:
@@ -1475,13 +1601,13 @@ def update_registry(
1475
1601
  values: A list of values to be saved as labels.
1476
1602
  field: The FieldAttr object representing the field for which labels are being saved.
1477
1603
  key: The name of the feature to save.
1478
- save_function: The name of the function to save the labels.
1479
1604
  using_key: The name of the instance from which to transfer labels (if applicable).
1480
1605
  validated_only: If True, only save validated labels.
1481
1606
  df: A DataFrame to save labels from.
1482
1607
  organism: The organism name.
1483
1608
  dtype: The type of the feature.
1484
1609
  source: The source record.
1610
+ exclude: Values to exclude from inspect.
1485
1611
  kwargs: Additional keyword arguments to pass to the registry model to create new records.
1486
1612
  """
1487
1613
  from lamindb._save import save as ln_save
@@ -1490,78 +1616,55 @@ def update_registry(
1490
1616
  registry = field.field.model
1491
1617
  filter_kwargs = check_registry_organism(registry, organism)
1492
1618
  filter_kwargs.update({"source": source} if source else {})
1619
+ if not values:
1620
+ return
1493
1621
 
1494
1622
  verbosity = settings.verbosity
1495
1623
  try:
1496
1624
  settings.verbosity = "error"
1625
+ labels_saved: dict = {"from public": [], "new": []}
1497
1626
 
1498
- # save from public
1627
+ # inspect the default instance and save validated records from public
1499
1628
  filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
1500
- existing_and_public_records = (
1501
- registry.from_values(
1502
- list(values),
1503
- field=field,
1504
- **filter_kwargs_current,
1505
- )
1506
- if values
1507
- else []
1629
+ existing_and_public_records = registry.from_values(
1630
+ list(values), field=field, **filter_kwargs_current
1508
1631
  )
1509
-
1510
- labels_saved: dict = {"from public": [], "without reference": []}
1511
-
1632
+ existing_and_public_labels = [
1633
+ getattr(r, field.field.name) for r in existing_and_public_records
1634
+ ]
1635
+ # public records that are not already in the database
1512
1636
  public_records = [r for r in existing_and_public_records if r._state.adding]
1513
1637
  # here we check to only save the public records if they are from the specified source
1514
1638
  # we check the uid because r.source and source can be from different instances
1515
1639
  if source:
1516
1640
  public_records = [r for r in public_records if r.source.uid == source.uid]
1517
-
1518
- if public_records:
1641
+ if len(public_records) > 0:
1519
1642
  settings.verbosity = "info"
1520
1643
  logger.info(f"saving validated records of '{key}'")
1521
1644
  settings.verbosity = "error"
1522
- ln_save(public_records)
1523
- labels_saved["from public"] = [
1524
- getattr(r, field.field.name) for r in public_records
1645
+ ln_save(public_records)
1646
+ labels_saved["from public"] = [
1647
+ getattr(r, field.field.name) for r in public_records
1648
+ ]
1649
+ # non-validated records from the default instance
1650
+ non_validated_labels = [
1651
+ i for i in values if i not in existing_and_public_labels
1525
1652
  ]
1526
- non_public_labels = [i for i in values if i not in labels_saved["from public"]]
1527
-
1528
- # inspect the default instance
1529
- inspect_result_current = standardize_and_inspect(
1530
- values=non_public_labels,
1531
- field=field,
1532
- registry=registry,
1533
- standardize=standardize,
1534
- exclude=exclude,
1535
- **filter_kwargs_current,
1536
- )
1537
- if not inspect_result_current.non_validated:
1538
- all_labels = registry.from_values(
1539
- inspect_result_current.validated,
1540
- field=field,
1541
- **filter_kwargs_current,
1542
- )
1543
- settings.verbosity = verbosity
1544
- return all_labels
1545
1653
 
1546
- # inspect the using_key instance
1654
+ # inspect and save validated records the using_key instance
1547
1655
  (
1548
1656
  labels_saved[f"from {using_key}"],
1549
1657
  non_validated_labels,
1550
1658
  ) = update_registry_from_using_instance(
1551
- inspect_result_current.non_validated,
1659
+ non_validated_labels,
1552
1660
  field=field,
1553
1661
  using_key=using_key,
1554
1662
  exclude=exclude,
1555
1663
  **filter_kwargs,
1556
1664
  )
1557
1665
 
1558
- labels_saved["without reference"] = [
1559
- i
1560
- for i in non_validated_labels
1561
- if i not in labels_saved[f"from {using_key}"]
1562
- ]
1563
-
1564
- # save non-validated records
1666
+ # save non-validated/new records
1667
+ labels_saved["new"] = non_validated_labels
1565
1668
  if not validated_only:
1566
1669
  non_validated_records = []
1567
1670
  if df is not None and registry == Feature:
@@ -1572,7 +1675,7 @@ def update_registry(
1572
1675
  # make sure organism record is saved to the current instance
1573
1676
  filter_kwargs["organism"] = _save_organism(name=organism)
1574
1677
  init_kwargs = {}
1575
- for value in labels_saved["without reference"]:
1678
+ for value in labels_saved["new"]:
1576
1679
  init_kwargs[field.field.name] = value
1577
1680
  if registry == Feature:
1578
1681
  init_kwargs["dtype"] = "cat" if dtype is None else dtype
@@ -1585,38 +1688,26 @@ def update_registry(
1585
1688
  )
1586
1689
  ln_save(non_validated_records)
1587
1690
 
1588
- # save parent labels for ulabels
1691
+ # save parent labels for ulabels, for example a parent label "project" for label "project001"
1589
1692
  if registry == ULabel and field.field.name == "name":
1590
- save_ulabels_with_parent(values, field=field, key=key)
1591
-
1592
- # # get all records that are now validated in the current instance
1593
- # all_labels = registry.from_values(
1594
- # inspect_result_current.validated + inspect_result_current.non_validated,
1595
- # field=field,
1596
- # **get_current_filter_kwargs(registry, filter_kwargs),
1597
- # )
1693
+ save_ulabels_parent(values, field=field, key=key)
1694
+
1598
1695
  finally:
1599
1696
  settings.verbosity = verbosity
1600
1697
 
1601
1698
  log_saved_labels(
1602
1699
  labels_saved,
1603
1700
  key=key,
1604
- save_function=save_function,
1605
1701
  model_field=f"{registry.__name__}.{field.field.name}",
1606
1702
  validated_only=validated_only,
1607
- warning=warning,
1608
1703
  )
1609
1704
 
1610
- # return all_labels
1611
-
1612
1705
 
1613
1706
  def log_saved_labels(
1614
1707
  labels_saved: dict,
1615
1708
  key: str,
1616
- save_function: str,
1617
1709
  model_field: str,
1618
1710
  validated_only: bool = True,
1619
- warning: bool = True,
1620
1711
  ) -> None:
1621
1712
  """Log the saved labels."""
1622
1713
  from ._from_values import _print_values
@@ -1625,45 +1716,26 @@ def log_saved_labels(
1625
1716
  for k, labels in labels_saved.items():
1626
1717
  if not labels:
1627
1718
  continue
1628
-
1629
- if k == "without reference" and validated_only:
1719
+ if k == "new" and validated_only:
1630
1720
  continue
1631
- # msg = colors.yellow(
1632
- # f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
1633
- # )
1634
- # lookup_print = (
1635
- # f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
1636
- # )
1637
-
1638
- # hint = f".add_new_from('{key}')"
1639
- # msg += f"\n → to lookup values, use {lookup_print}"
1640
- # msg += (
1641
- # f"\n → to save, run {colors.yellow(hint)}"
1642
- # if save_function == "add_new_from"
1643
- # else f"\n → to save, run {colors.yellow(save_function)}"
1644
- # )
1645
- # if warning:
1646
- # logger.warning(msg)
1647
- # else:
1648
- # logger.info(msg)
1649
1721
  else:
1650
- k = "" if k == "without reference" else f"{colors.green(k)} "
1722
+ k = "" if k == "new" else f"{colors.green(k)} "
1651
1723
  # the term "transferred" stresses that this is always in the context of transferring
1652
1724
  # labels from a public ontology or a different instance to the present instance
1653
1725
  s = "s" if len(labels) > 1 else ""
1654
1726
  logger.success(
1655
- f"added {len(labels)} record{s} {k}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
1727
+ f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_print_values(labels)}'
1656
1728
  )
1657
1729
 
1658
1730
 
1659
- def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
1731
+ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
1660
1732
  """Save a parent label for the given labels."""
1661
1733
  registry = field.field.model
1662
1734
  assert registry == ULabel # noqa: S101
1663
1735
  all_records = registry.from_values(list(values), field=field)
1664
- is_feature = registry.filter(name=f"is_{key}").one_or_none()
1736
+ is_feature = registry.filter(name=f"{key}").one_or_none()
1665
1737
  if is_feature is None:
1666
- is_feature = registry(name=f"is_{key}").save()
1738
+ is_feature = registry(name=f"{key}").save()
1667
1739
  logger.important(f"Created a parent ULabel: {is_feature}")
1668
1740
  is_feature.children.add(*all_records)
1669
1741
 
@@ -1672,7 +1744,6 @@ def update_registry_from_using_instance(
1672
1744
  values: list[str],
1673
1745
  field: FieldAttr,
1674
1746
  using_key: str | None = None,
1675
- standardize: bool = False,
1676
1747
  exclude: str | list | None = None,
1677
1748
  **kwargs,
1678
1749
  ) -> tuple[list[str], list[str]]:
@@ -1682,7 +1753,6 @@ def update_registry_from_using_instance(
1682
1753
  values: A list of values to be saved as labels.
1683
1754
  field: The FieldAttr object representing the field for which labels are being saved.
1684
1755
  using_key: The name of the instance from which to transfer labels (if applicable).
1685
- standardize: Whether to also standardize the values.
1686
1756
  kwargs: Additional keyword arguments to pass to the registry model.
1687
1757
 
1688
1758
  Returns:
@@ -1694,11 +1764,10 @@ def update_registry_from_using_instance(
1694
1764
  if using_key is not None and using_key != "default":
1695
1765
  registry_using = get_registry_instance(field.field.model, using_key)
1696
1766
 
1697
- inspect_result_using = standardize_and_inspect(
1767
+ inspect_result_using = inspect_instance(
1698
1768
  values=values,
1699
1769
  field=field,
1700
1770
  registry=registry_using,
1701
- standardize=standardize,
1702
1771
  exclude=exclude,
1703
1772
  **kwargs,
1704
1773
  )
@@ -1713,7 +1782,7 @@ def update_registry_from_using_instance(
1713
1782
  return labels_saved, not_saved
1714
1783
 
1715
1784
 
1716
- def _save_organism(name: str): # pragma: no cover
1785
+ def _save_organism(name: str):
1717
1786
  """Save an organism record."""
1718
1787
  import bionty as bt
1719
1788
 
@@ -1722,8 +1791,8 @@ def _save_organism(name: str): # pragma: no cover
1722
1791
  organism = bt.Organism.from_source(name=name)
1723
1792
  if organism is None:
1724
1793
  raise ValidationError(
1725
- f"Organism '{name}' not found\n"
1726
- f" → please save it: bt.Organism(name='{name}').save()"
1794
+ f'Organism "{name}" not found\n'
1795
+ f' → please save it: bt.Organism(name="{name}").save()'
1727
1796
  )
1728
1797
  organism.save()
1729
1798
  return organism