lamindb 0.76.13__py3-none-any.whl → 0.76.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_curate.py CHANGED
@@ -34,21 +34,21 @@ class CurateLookup:
34
34
  categoricals: dict[str, FieldAttr],
35
35
  slots: dict[str, FieldAttr] = None,
36
36
  using_key: str | None = None,
37
+ public: bool = False,
37
38
  ) -> None:
38
39
  if slots is None:
39
40
  slots = {}
40
41
  self._fields = {**categoricals, **slots}
41
42
  self._using_key = None if using_key == "default" else using_key
42
43
  self._using_key_name = self._using_key or ln_setup.settings.instance.slug
43
- debug_message = (
44
- f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
45
- )
44
+ self._public = public
45
+ debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
46
46
  logger.debug(debug_message)
47
47
 
48
48
  def __getattr__(self, name):
49
49
  if name in self._fields:
50
50
  registry = self._fields[name].field.model
51
- if self._using_key == "public":
51
+ if self._public and hasattr(registry, "public"):
52
52
  return registry.public().lookup()
53
53
  else:
54
54
  return get_registry_instance(registry, self._using_key).lookup()
@@ -59,7 +59,7 @@ class CurateLookup:
59
59
  def __getitem__(self, name):
60
60
  if name in self._fields:
61
61
  registry = self._fields[name].field.model
62
- if self._using_key == "public":
62
+ if self._public and hasattr(registry, "public"):
63
63
  return registry.public().lookup()
64
64
  else:
65
65
  return get_registry_instance(registry, self._using_key).lookup()
@@ -75,12 +75,14 @@ class CurateLookup:
75
75
  getitem_keys = "\n ".join(
76
76
  [str([key]) for key in self._fields if not key.isidentifier()]
77
77
  )
78
+ ref = "public" if self._public else self._using_key_name
78
79
  return (
79
- f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
80
+ f"Lookup objects from the {colors.italic(ref)}:\n "
80
81
  f"{colors.green(getattr_keys)}\n "
81
- f"{colors.green(getitem_keys)}\n\n"
82
- "Example:\n → categories = validator.lookup().cell_type\n"
83
- " → categories.alveolar_type_1_fibroblast_cell"
82
+ f"{colors.green(getitem_keys)}\n"
83
+ "Example:\n → categories = validator.lookup()['cell_type']\n"
84
+ " → categories.alveolar_type_1_fibroblast_cell\n\n"
85
+ "To look up public ontologies, use .lookup(public=True)"
84
86
  )
85
87
  else: # pragma: no cover
86
88
  return colors.warning("No fields are found!")
@@ -97,12 +99,20 @@ class BaseCurator:
97
99
  """
98
100
  pass
99
101
 
100
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
102
+ def save_artifact(
103
+ self,
104
+ description: str | None = None,
105
+ key: str | None = None,
106
+ revises: Artifact | None = None,
107
+ run: Run | None = None,
108
+ ) -> Artifact:
101
109
  """Save the dataset as artifact.
102
110
 
103
111
  Args:
104
- description: Description of the DataFrame object.
105
- **kwargs: Object level metadata.
112
+ description: `str | None = None` A description of the DataFrame object.
113
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
114
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
115
+ run: `Run | None = None` The run that creates the artifact.
106
116
 
107
117
  Returns:
108
118
  A saved artifact record.
@@ -174,7 +184,7 @@ class DataFrameCurator(BaseCurator):
174
184
  def non_validated(self) -> list:
175
185
  """Return the non-validated features and labels."""
176
186
  if self._non_validated is None:
177
- raise ValueError("Please run validate() first!")
187
+ raise ValidationError("Please run validate() first!")
178
188
  return self._non_validated
179
189
 
180
190
  @property
@@ -182,7 +192,9 @@ class DataFrameCurator(BaseCurator):
182
192
  """Return the columns fields to validate against."""
183
193
  return self._fields
184
194
 
185
- def lookup(self, using_key: str | None = None) -> CurateLookup:
195
+ def lookup(
196
+ self, using_key: str | None = None, public: bool = False
197
+ ) -> CurateLookup:
186
198
  """Lookup categories.
187
199
 
188
200
  Args:
@@ -194,6 +206,7 @@ class DataFrameCurator(BaseCurator):
194
206
  categoricals=self._fields,
195
207
  slots={"columns": self._columns_field},
196
208
  using_key=using_key or self._using_key,
209
+ public=public,
197
210
  )
198
211
 
199
212
  def _check_valid_keys(self, extra: set = None) -> None:
@@ -209,7 +222,7 @@ class DataFrameCurator(BaseCurator):
209
222
  valid_keys = set(self._df.columns) | {"columns"} | extra
210
223
  nonval_keys = [key for key in d.keys() if key not in valid_keys]
211
224
  if len(nonval_keys) > 0:
212
- raise ValueError(
225
+ raise ValidationError(
213
226
  f"the following keys passed to {name} are not allowed: {nonval_keys}"
214
227
  )
215
228
 
@@ -245,16 +258,6 @@ class DataFrameCurator(BaseCurator):
245
258
  **kwargs,
246
259
  )
247
260
 
248
- def add_validated_from(self, key: str, organism: str | None = None):
249
- """Add validated categories.
250
-
251
- Args:
252
- key: The key referencing the slot in the DataFrame.
253
- organism: The organism name.
254
- """
255
- self._kwargs.update({"organism": organism} if organism else {})
256
- self._update_registry(key, validated_only=True, **self._kwargs)
257
-
258
261
  def add_new_from(self, key: str, organism: str | None = None, **kwargs):
259
262
  """Add validated & new categories.
260
263
 
@@ -285,9 +288,11 @@ class DataFrameCurator(BaseCurator):
285
288
  self._save_columns(validated_only=validated_only, **kwargs)
286
289
  else:
287
290
  if categorical not in self.fields:
288
- raise ValueError(f"Feature {categorical} is not part of the fields!")
291
+ raise ValidationError(
292
+ f"Feature {categorical} is not part of the fields!"
293
+ )
289
294
  update_registry(
290
- values=self._df[categorical].unique().tolist(),
295
+ values=flatten_unique(self._df[categorical]),
291
296
  field=self.fields[categorical],
292
297
  key=categorical,
293
298
  using_key=self._using_key,
@@ -300,7 +305,7 @@ class DataFrameCurator(BaseCurator):
300
305
  def _update_registry_all(self, validated_only: bool = True, **kwargs):
301
306
  """Save labels for all features."""
302
307
  for name in self.fields.keys():
303
- logger.info(f"saving labels for '{name}'")
308
+ logger.info(f"saving validated records of '{name}'")
304
309
  self._update_registry(name, validated_only=validated_only, **kwargs)
305
310
 
306
311
  def validate(self, organism: str | None = None) -> bool:
@@ -313,6 +318,10 @@ class DataFrameCurator(BaseCurator):
313
318
  Whether the DataFrame is validated.
314
319
  """
315
320
  self._kwargs.update({"organism": organism} if organism else {})
321
+
322
+ # add all validated records to the current instance
323
+ self._update_registry_all()
324
+
316
325
  self._validated, self._non_validated = validate_categories_in_df( # type: ignore
317
326
  self._df,
318
327
  fields=self.fields,
@@ -323,12 +332,20 @@ class DataFrameCurator(BaseCurator):
323
332
  )
324
333
  return self._validated
325
334
 
326
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
335
+ def save_artifact(
336
+ self,
337
+ description: str | None = None,
338
+ key: str | None = None,
339
+ revises: Artifact | None = None,
340
+ run: Run | None = None,
341
+ ) -> Artifact:
327
342
  """Save the validated DataFrame and metadata.
328
343
 
329
344
  Args:
330
- description: Description of the DataFrame object.
331
- **kwargs: Object level metadata.
345
+ description: `str | None = None` Description of the DataFrame object.
346
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
347
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
348
+ run: `Run | None = None` The run that creates the artifact.
332
349
 
333
350
  Returns:
334
351
  A saved artifact record.
@@ -344,15 +361,18 @@ class DataFrameCurator(BaseCurator):
344
361
  verbosity = settings.verbosity
345
362
  try:
346
363
  settings.verbosity = "warning"
347
- # save all validated records to the current instance
348
- self.add_validated_from("all")
364
+ if not self._validated:
365
+ # save all validated records to the current instance
366
+ self._update_registry_all()
349
367
 
350
368
  self._artifact = save_artifact(
351
369
  self._df,
352
370
  description=description,
353
371
  fields=self.fields,
354
372
  columns_field=self._columns_field,
355
- **kwargs,
373
+ key=key,
374
+ revises=revises,
375
+ run=run,
356
376
  **self._kwargs,
357
377
  )
358
378
  finally:
@@ -457,7 +477,9 @@ class AnnDataCurator(DataFrameCurator):
457
477
  """Return the obs fields to validate against."""
458
478
  return self._obs_fields
459
479
 
460
- def lookup(self, using_key: str | None = None) -> CurateLookup:
480
+ def lookup(
481
+ self, using_key: str | None = None, public: bool = False
482
+ ) -> CurateLookup:
461
483
  """Lookup categories.
462
484
 
463
485
  Args:
@@ -469,6 +491,7 @@ class AnnDataCurator(DataFrameCurator):
469
491
  categoricals=self._obs_fields,
470
492
  slots={"columns": self._columns_field, "var_index": self._var_field},
471
493
  using_key=using_key or self._using_key,
494
+ public=public,
472
495
  )
473
496
 
474
497
  def _save_from_var_index(
@@ -479,7 +502,7 @@ class AnnDataCurator(DataFrameCurator):
479
502
  values=list(self._adata.var.index),
480
503
  field=self.var_index,
481
504
  key="var_index",
482
- save_function="add_new_from_var_index",
505
+ save_function=".add_new_from_var_index()",
483
506
  using_key=self._using_key,
484
507
  validated_only=validated_only,
485
508
  organism=organism,
@@ -489,12 +512,11 @@ class AnnDataCurator(DataFrameCurator):
489
512
 
490
513
  def _update_registry_all(self, validated_only: bool = True, **kwargs):
491
514
  """Save labels for all features."""
492
- for name in self.fields.keys():
493
- logger.info(f"saving labels for '{name}'")
494
- if name == "var_index":
495
- self._save_from_var_index(validated_only=validated_only, **kwargs)
496
- else:
497
- self._update_registry(name, validated_only=validated_only, **kwargs)
515
+ logger.info("saving validated records of 'var_index'")
516
+ self._save_from_var_index(validated_only=validated_only, **self._kwargs)
517
+ for name in self._obs_fields.keys():
518
+ logger.info(f"saving validated terms of '{name}'")
519
+ self._update_registry(name, validated_only=validated_only, **self._kwargs)
498
520
 
499
521
  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
500
522
  """Update variable records.
@@ -506,15 +528,6 @@ class AnnDataCurator(DataFrameCurator):
506
528
  self._kwargs.update({"organism": organism} if organism else {})
507
529
  self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
508
530
 
509
- def add_validated_from_var_index(self, organism: str | None = None):
510
- """Add validated variable records.
511
-
512
- Args:
513
- organism: The organism name.
514
- """
515
- self._kwargs.update({"organism": organism} if organism else {})
516
- self._save_from_var_index(validated_only=True, **self._kwargs)
517
-
518
531
  def validate(self, organism: str | None = None) -> bool:
519
532
  """Validate categories.
520
533
 
@@ -530,6 +543,9 @@ class AnnDataCurator(DataFrameCurator):
530
543
  f"validating metadata using registries of instance {colors.italic(self._using_key)}"
531
544
  )
532
545
 
546
+ # add all validated records to the current instance
547
+ self._update_registry_all()
548
+
533
549
  validated_var, non_validated_var = validate_categories(
534
550
  self._adata.var.index,
535
551
  field=self._var_field,
@@ -554,30 +570,49 @@ class AnnDataCurator(DataFrameCurator):
554
570
  self._validated = validated_var and validated_obs
555
571
  return self._validated
556
572
 
557
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
573
+ def save_artifact(
574
+ self,
575
+ description: str | None = None,
576
+ key: str | None = None,
577
+ revises: Artifact | None = None,
578
+ run: Run | None = None,
579
+ ) -> Artifact:
558
580
  """Save the validated ``AnnData`` and metadata.
559
581
 
560
582
  Args:
561
- description: Description of the ``AnnData`` object.
562
- **kwargs: Object level metadata.
583
+ description: `str | None = None` A description of the ``AnnData`` object.
584
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
585
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
586
+ run: `Run | None = None` The run that creates the artifact.
563
587
 
564
588
  Returns:
565
589
  A saved artifact record.
566
590
  """
591
+ from lamindb.core._settings import settings
592
+
567
593
  if not self._validated:
568
594
  self.validate()
569
595
  if not self._validated:
570
596
  raise ValidationError("Dataset does not validate. Please curate.")
571
-
572
- self._artifact = save_artifact(
573
- self._data,
574
- adata=self._adata,
575
- description=description,
576
- columns_field=self.var_index,
577
- fields=self.categoricals,
578
- **self._kwargs,
579
- **kwargs,
580
- )
597
+ verbosity = settings.verbosity
598
+ try:
599
+ settings.verbosity = "warning"
600
+ if not self._validated:
601
+ # save all validated records to the current instance
602
+ self._update_registry_all()
603
+ self._artifact = save_artifact(
604
+ self._data,
605
+ adata=self._adata,
606
+ description=description,
607
+ columns_field=self.var_index,
608
+ fields=self.categoricals,
609
+ key=key,
610
+ revises=revises,
611
+ run=run,
612
+ **self._kwargs,
613
+ )
614
+ finally:
615
+ settings.verbosity = verbosity
581
616
  return self._artifact
582
617
 
583
618
 
@@ -656,10 +691,6 @@ class MuDataCurator:
656
691
  )
657
692
  for modality in self._modalities
658
693
  }
659
- for modality in self._var_fields.keys():
660
- self._save_from_var_index_modality(
661
- modality=modality, validated_only=True, **self._kwargs
662
- )
663
694
 
664
695
  @property
665
696
  def var_index(self) -> FieldAttr:
@@ -675,7 +706,7 @@ class MuDataCurator:
675
706
  """Verify the modality exists."""
676
707
  for modality in modalities:
677
708
  if modality not in self._mdata.mod.keys():
678
- raise ValueError(f"modality '{modality}' does not exist!")
709
+ raise ValidationError(f"modality '{modality}' does not exist!")
679
710
 
680
711
  def _save_from_var_index_modality(
681
712
  self, modality: str, validated_only: bool = True, **kwargs
@@ -685,7 +716,7 @@ class MuDataCurator:
685
716
  values=list(self._mdata[modality].var.index),
686
717
  field=self._var_fields[modality],
687
718
  key="var_index",
688
- save_function="add_new_from_var_index",
719
+ save_function=f'.add_new_from_var_index("{modality}")',
689
720
  using_key=self._using_key,
690
721
  validated_only=validated_only,
691
722
  dtype="number",
@@ -700,7 +731,7 @@ class MuDataCurator:
700
731
  obs_fields: dict[str, dict[str, FieldAttr]] = {}
701
732
  for k, v in categoricals.items():
702
733
  if k not in self._mdata.obs.columns:
703
- raise ValueError(f"column '{k}' does not exist in mdata.obs!")
734
+ raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
704
735
  if any(k.startswith(prefix) for prefix in prefixes):
705
736
  modality, col = k.split(":")[0], k.split(":")[1]
706
737
  if modality not in obs_fields.keys():
@@ -712,7 +743,9 @@ class MuDataCurator:
712
743
  obs_fields["obs"][k] = v
713
744
  return obs_fields
714
745
 
715
- def lookup(self, using_key: str | None = None) -> CurateLookup:
746
+ def lookup(
747
+ self, using_key: str | None = None, public: bool = False
748
+ ) -> CurateLookup:
716
749
  """Lookup categories.
717
750
 
718
751
  Args:
@@ -727,6 +760,7 @@ class MuDataCurator:
727
760
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
728
761
  },
729
762
  using_key=using_key or self._using_key,
763
+ public=public,
730
764
  )
731
765
 
732
766
  def add_new_from_columns(
@@ -774,33 +808,14 @@ class MuDataCurator:
774
808
  modality=modality, validated_only=False, **self._kwargs, **kwargs
775
809
  )
776
810
 
777
- def add_validated_from_var_index(self, modality: str, organism: str | None = None):
778
- """Add validated variable records.
779
-
780
- Args:
781
- modality: The modality name.
782
- organism: The organism name.
783
- """
784
- self._kwargs.update({"organism": organism} if organism else {})
785
- self._save_from_var_index_modality(
786
- modality=modality, validated_only=True, **self._kwargs
787
- )
788
-
789
- def add_validated_from(
790
- self, key: str, modality: str | None = None, organism: str | None = None
791
- ):
792
- """Add validated categories.
793
-
794
- Args:
795
- key: The key referencing the slot in the DataFrame.
796
- modality: The modality name.
797
- organism: The organism name.
798
- """
799
- self._kwargs.update({"organism": organism} if organism else {})
800
- modality = modality or "obs"
801
- if modality in self._df_annotators:
802
- df_annotator = self._df_annotators[modality]
803
- df_annotator.add_validated_from(key=key, **self._kwargs)
811
+ def _update_registry_all(self):
812
+ """Update all registries."""
813
+ for modality in self._var_fields.keys():
814
+ self._save_from_var_index_modality(
815
+ modality=modality, validated_only=True, **self._kwargs
816
+ )
817
+ for _, df_annotator in self._df_annotators.items():
818
+ df_annotator._update_registry_all(validated_only=True, **self._kwargs)
804
819
 
805
820
  def add_new_from(
806
821
  self,
@@ -827,11 +842,22 @@ class MuDataCurator:
827
842
 
828
843
  def validate(self, organism: str | None = None) -> bool:
829
844
  """Validate categories."""
845
+ from lamindb.core._settings import settings
846
+
830
847
  self._kwargs.update({"organism": organism} if organism else {})
831
848
  if self._using_key is not None and self._using_key != "default":
832
849
  logger.important(
833
850
  f"validating metadata using registries of instance {colors.italic(self._using_key)}"
834
851
  )
852
+
853
+ # add all validated records to the current instance
854
+ verbosity = settings.verbosity
855
+ try:
856
+ settings.verbosity = "error"
857
+ self._update_registry_all()
858
+ finally:
859
+ settings.verbosity = verbosity
860
+
835
861
  validated_var = True
836
862
  non_validated_var_modality = {}
837
863
  for modality, var_field in self._var_fields.items():
@@ -842,6 +868,7 @@ class MuDataCurator:
842
868
  using_key=self._using_key,
843
869
  source=self._sources.get(modality, {}).get("var_index"),
844
870
  exclude=self._exclude.get(modality, {}).get("var_index"),
871
+ validated_hint_print=f'.add_validated_from_var_index("{modality}")',
845
872
  **self._kwargs, # type: ignore
846
873
  )
847
874
  validated_var &= is_validated_var
@@ -874,56 +901,75 @@ class MuDataCurator:
874
901
  self._validated = validated_var and validated_obs
875
902
  return self._validated
876
903
 
877
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
904
+ def save_artifact(
905
+ self,
906
+ description: str | None = None,
907
+ key: str | None = None,
908
+ revises: Artifact | None = None,
909
+ run: Run | None = None,
910
+ ) -> Artifact:
878
911
  """Save the validated ``MuData`` and metadata.
879
912
 
880
913
  Args:
881
- description: Description of the ``MuData`` object.
882
- **kwargs: Object level metadata.
914
+ description: `str | None = None` A description of the ``MuData`` object.
915
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
916
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
917
+ run: `Run | None = None` The run that creates the artifact.
883
918
 
884
919
  Returns:
885
920
  A saved artifact record.
886
921
  """
922
+ from lamindb.core._settings import settings
923
+
887
924
  if not self._validated:
888
- raise ValidationError("Please run `validate()` first!")
925
+ self.validate()
926
+ if not self._validated:
927
+ raise ValidationError("Dataset does not validate. Please curate.")
928
+ verbosity = settings.verbosity
929
+ try:
930
+ settings.verbosity = "warning"
931
+ if not self._validated:
932
+ # save all validated records to the current instance
933
+ self._update_registry_all()
889
934
 
890
- self._artifact = save_artifact(
891
- self._mdata,
892
- description=description,
893
- columns_field=self.var_index,
894
- fields=self.categoricals,
895
- **self._kwargs,
896
- **kwargs,
897
- )
935
+ self._artifact = save_artifact(
936
+ self._mdata,
937
+ description=description,
938
+ columns_field=self.var_index,
939
+ fields=self.categoricals,
940
+ key=key,
941
+ revises=revises,
942
+ run=run,
943
+ **self._kwargs,
944
+ )
945
+ finally:
946
+ settings.verbosity = verbosity
898
947
  return self._artifact
899
948
 
900
949
 
901
950
  class Curator(BaseCurator):
902
951
  """Dataset curator.
903
952
 
904
- Data curation entails accurately labeling datasets with standardized metadata
905
- to facilitate data integration, interpretation and analysis.
906
-
907
- The curation flow has several steps:
908
-
909
- 1. Instantiate `Curator` from one of the following dataset objects:
910
-
911
- - :meth:`~lamindb.Curator.from_df`
912
- - :meth:`~lamindb.Curator.from_anndata`
913
- - :meth:`~lamindb.Curator.from_mudata`
953
+ A `Curator` object makes it easy to save validated & annotated artifacts.
914
954
 
915
- During object creation, any passed categoricals found in the object will be saved.
955
+ Example:
916
956
 
917
- 2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:
957
+ >>> curator = ln.Curator.from_df(
958
+ >>> df,
959
+ >>> # define validation criteria as mappings
960
+ >>> columns=ln.Feature.name, # map column names
961
+ >>> categoricals={"perturbation": ln.ULabel.name}, # map categories
962
+ >>> )
963
+ >>> curator.validate() # validate the data in df
964
+ >>> artifact = curator.save_artifact(description="my RNA-seq")
965
+ >>> artifact.describe() # see annotations
918
966
 
919
- - Values that can successfully validated and already exist in the registry.
920
- - Values which are new and not yet validated or potentially problematic values.
967
+ `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
921
968
 
922
- 3. Determine how to handle validated and non-validated values:
969
+ If you find non-validated values, you have several options:
923
970
 
924
- - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
925
- - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
926
- - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
971
+ - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`
972
+ - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and addressed manually
927
973
  """
928
974
 
929
975
  @classmethod
@@ -1076,7 +1122,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
1076
1122
  import bionty as bt
1077
1123
 
1078
1124
  if organism is None and bt.settings.organism is None:
1079
- raise ValueError(
1125
+ raise ValidationError(
1080
1126
  f"{registry.__name__} registry requires an organism!\n"
1081
1127
  " → please pass an organism name via organism="
1082
1128
  )
@@ -1104,8 +1150,8 @@ def validate_categories(
1104
1150
  using_key: A reference LaminDB instance.
1105
1151
  organism: The organism name.
1106
1152
  source: The source record.
1107
- exclude: Exclude specific values.
1108
- standardize: Standardize the values.
1153
+ exclude: Exclude specific values from validation.
1154
+ standardize: Whether to standardize the values.
1109
1155
  validated_hint_print: The hint to print for validated values.
1110
1156
  """
1111
1157
  from lamindb._from_values import _print_values
@@ -1166,15 +1212,18 @@ def validate_categories(
1166
1212
 
1167
1213
  validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
1168
1214
  n_validated = len(values_validated)
1215
+
1169
1216
  if n_validated > 0:
1170
1217
  _log_mapping_info()
1218
+ terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}"
1219
+ val_numerous = "" if n_validated == 1 else "s"
1171
1220
  logger.warning(
1172
- f"found {colors.yellow(n_validated)} validated terms: "
1173
- f"{colors.yellow(values_validated)}\n → save terms via "
1174
- f"{colors.yellow(validated_hint_print)}"
1221
+ f"found {colors.yellow(n_validated)} validated term{val_numerous}: "
1222
+ f"{colors.yellow(terms_str)}\n"
1223
+ f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}"
1175
1224
  )
1176
1225
 
1177
- non_validated_hint_print = f".add_new_from('{key}')"
1226
+ non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_")
1178
1227
  non_validated = [i for i in non_validated if i not in values_validated]
1179
1228
  n_non_validated = len(non_validated)
1180
1229
  if n_non_validated == 0:
@@ -1186,13 +1235,15 @@ def validate_categories(
1186
1235
  # validated values still need to be saved to the current instance
1187
1236
  return False, []
1188
1237
  else:
1189
- are = "are" if n_non_validated > 1 else "is"
1238
+ non_val_numerous = ("", "is") if n_non_validated == 1 else ("s", "are")
1190
1239
  print_values = _print_values(non_validated)
1191
1240
  warning_message = (
1192
- f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
1193
- f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
1241
+ f"{colors.red(f'{n_non_validated} term{non_val_numerous[0]}')} {non_val_numerous[1]} not validated: "
1242
+ f"{colors.red(', '.join(print_values.split(', ')[:10]) + ', ...' if len(print_values.split(', ')) > 10 else print_values)}\n"
1243
+ f"→ fix typo{non_val_numerous[0]}, remove non-existent value{non_val_numerous[0]}, or save term{non_val_numerous[0]} via "
1194
1244
  f"{colors.red(non_validated_hint_print)}"
1195
1245
  )
1246
+
1196
1247
  if logger.indent == "":
1197
1248
  _log_mapping_info()
1198
1249
  logger.warning(warning_message)
@@ -1239,7 +1290,9 @@ def save_artifact(
1239
1290
  description: str | None = None,
1240
1291
  organism: str | None = None,
1241
1292
  adata: ad.AnnData | None = None,
1242
- **kwargs,
1293
+ key: str | None = None,
1294
+ revises: Artifact | None = None,
1295
+ run: Run | None = None,
1243
1296
  ) -> Artifact:
1244
1297
  """Save all metadata with an Artifact.
1245
1298
 
@@ -1249,29 +1302,43 @@ def save_artifact(
1249
1302
  fields: A dictionary mapping obs_column to registry_field.
1250
1303
  columns_field: The registry field to validate variables index against.
1251
1304
  organism: The organism name.
1252
- adata: The AnnData object to save, must be provided if data is a path.
1253
- kwargs: Additional keyword arguments to pass to the registry model.
1305
+ adata: The AnnData object to save and get n_observations, must be provided if data is a path.
1306
+ type: `Literal["dataset", "model"] | None = None` The artifact type.
1307
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1308
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
1309
+ run: `Run | None = None` The run that creates the artifact.
1254
1310
 
1255
1311
  Returns:
1256
1312
  The saved Artifact.
1257
1313
  """
1258
1314
  from ._artifact import data_is_anndata
1315
+ from .core._data import add_labels
1259
1316
 
1260
1317
  artifact = None
1261
1318
  if data_is_anndata(data):
1262
1319
  assert adata is not None # noqa: S101
1263
- artifact = Artifact.from_anndata(data, description=description, **kwargs)
1320
+ artifact = Artifact.from_anndata(
1321
+ data, description=description, key=key, revises=revises, run=run
1322
+ )
1264
1323
  artifact.n_observations = adata.shape[0]
1265
1324
  data = adata
1266
1325
 
1267
1326
  elif isinstance(data, pd.DataFrame):
1268
- artifact = Artifact.from_df(data, description=description, **kwargs)
1327
+ artifact = Artifact.from_df(
1328
+ data, description=description, key=key, revises=revises, run=run
1329
+ )
1269
1330
  else:
1270
1331
  try:
1271
1332
  from mudata import MuData
1272
1333
 
1273
1334
  if isinstance(data, MuData):
1274
- artifact = Artifact.from_mudata(data, description=description, **kwargs)
1335
+ artifact = Artifact.from_mudata(
1336
+ data,
1337
+ description=description,
1338
+ key=key,
1339
+ revises=revises,
1340
+ run=run,
1341
+ )
1275
1342
  artifact.n_observations = data.n_obs
1276
1343
  except ImportError:
1277
1344
  pass
@@ -1301,7 +1368,12 @@ def save_artifact(
1301
1368
  else:
1302
1369
  raise NotImplementedError
1303
1370
 
1304
- def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
1371
+ def _add_labels(
1372
+ data,
1373
+ artifact: Artifact,
1374
+ fields: dict[str, FieldAttr],
1375
+ feature_ref_is_name: bool | None = None,
1376
+ ):
1305
1377
  features = Feature.lookup().dict()
1306
1378
  for key, field in fields.items():
1307
1379
  feature = features.get(key)
@@ -1314,16 +1386,47 @@ def save_artifact(
1314
1386
  field=field,
1315
1387
  **filter_kwargs_current,
1316
1388
  )
1317
- artifact.labels.add(labels, feature)
1389
+ if len(labels) == 0:
1390
+ continue
1391
+ if hasattr(registry, "_name_field"):
1392
+ label_ref_is_name = field.field.name == registry._name_field
1393
+ add_labels(
1394
+ artifact,
1395
+ records=labels,
1396
+ feature=feature,
1397
+ feature_ref_is_name=feature_ref_is_name,
1398
+ label_ref_is_name=label_ref_is_name,
1399
+ )
1318
1400
 
1319
1401
  if artifact._accessor == "MuData":
1320
1402
  for modality, modality_fields in fields.items():
1403
+ column_field_modality = columns_field.get(modality)
1321
1404
  if modality == "obs":
1322
- _add_labels(data, artifact, modality_fields)
1405
+ _add_labels(
1406
+ data,
1407
+ artifact,
1408
+ modality_fields,
1409
+ feature_ref_is_name=(
1410
+ None
1411
+ if column_field_modality is None
1412
+ else _ref_is_name(column_field_modality)
1413
+ ),
1414
+ )
1323
1415
  else:
1324
- _add_labels(data[modality], artifact, modality_fields)
1416
+ _add_labels(
1417
+ data[modality],
1418
+ artifact,
1419
+ modality_fields,
1420
+ feature_ref_is_name=(
1421
+ None
1422
+ if column_field_modality is None
1423
+ else _ref_is_name(column_field_modality)
1424
+ ),
1425
+ )
1325
1426
  else:
1326
- _add_labels(data, artifact, fields)
1427
+ _add_labels(
1428
+ data, artifact, fields, feature_ref_is_name=_ref_is_name(columns_field)
1429
+ )
1327
1430
 
1328
1431
  slug = ln_setup.settings.instance.slug
1329
1432
  if ln_setup.settings.instance.is_remote: # pragma: no cover
@@ -1331,6 +1434,19 @@ def save_artifact(
1331
1434
  return artifact
1332
1435
 
1333
1436
 
1437
+ def flatten_unique(series):
1438
+ """Return the unique values of a pandas series, flattening entries that are lists."""
1439
+ result = set()
1440
+
1441
+ for item in series:
1442
+ if isinstance(item, list):
1443
+ result.update(item)
1444
+ else:
1445
+ result.add(item)
1446
+
1447
+ return list(result)
1448
+
1449
+
1334
1450
  def update_registry(
1335
1451
  values: list[str],
1336
1452
  field: FieldAttr,
@@ -1438,7 +1554,8 @@ def update_registry(
1438
1554
  if not validated_only:
1439
1555
  non_validated_records = []
1440
1556
  if df is not None and registry == Feature:
1441
- non_validated_records = Feature.from_df(df)
1557
+ nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
1558
+ non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
1442
1559
  else:
1443
1560
  if "organism" in filter_kwargs:
1444
1561
  # make sure organism record is saved to the current instance
@@ -1499,24 +1616,25 @@ def log_saved_labels(
1499
1616
  continue
1500
1617
 
1501
1618
  if k == "without reference" and validated_only:
1502
- msg = colors.yellow(
1503
- f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
1504
- )
1505
- lookup_print = (
1506
- f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
1507
- )
1508
-
1509
- hint = f".add_new_from('{key}')"
1510
- msg += f"\n → to lookup values, use {lookup_print}"
1511
- msg += (
1512
- f"\n → to save, run {colors.yellow(hint)}"
1513
- if save_function == "add_new_from"
1514
- else f"\n → to save, run {colors.yellow(save_function)}"
1515
- )
1516
- if warning:
1517
- logger.warning(msg)
1518
- else:
1519
- logger.info(msg)
1619
+ continue
1620
+ # msg = colors.yellow(
1621
+ # f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
1622
+ # )
1623
+ # lookup_print = (
1624
+ # f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
1625
+ # )
1626
+
1627
+ # hint = f".add_new_from('{key}')"
1628
+ # msg += f"\n → to lookup values, use {lookup_print}"
1629
+ # msg += (
1630
+ # f"\n → to save, run {colors.yellow(hint)}"
1631
+ # if save_function == "add_new_from"
1632
+ # else f"\n → to save, run {colors.yellow(save_function)}"
1633
+ # )
1634
+ # if warning:
1635
+ # logger.warning(msg)
1636
+ # else:
1637
+ # logger.info(msg)
1520
1638
  else:
1521
1639
  k = "" if k == "without reference" else f"{colors.green(k)} "
1522
1640
  # the term "transferred" stresses that this is always in the context of transferring
@@ -1534,8 +1652,8 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
1534
1652
  all_records = registry.from_values(list(values), field=field)
1535
1653
  is_feature = registry.filter(name=f"is_{key}").one_or_none()
1536
1654
  if is_feature is None:
1537
- is_feature = registry(name=f"is_{key}")
1538
- is_feature.save()
1655
+ is_feature = registry(name=f"is_{key}").save()
1656
+ logger.important(f"Created a parent ULabel: {is_feature}")
1539
1657
  is_feature.children.add(*all_records)
1540
1658
 
1541
1659
 
@@ -1592,7 +1710,7 @@ def _save_organism(name: str): # pragma: no cover
1592
1710
  if organism is None:
1593
1711
  organism = bt.Organism.from_source(name=name)
1594
1712
  if organism is None:
1595
- raise ValueError(
1713
+ raise ValidationError(
1596
1714
  f"Organism '{name}' not found\n"
1597
1715
  f" → please save it: bt.Organism(name='{name}').save()"
1598
1716
  )
@@ -1600,4 +1718,12 @@ def _save_organism(name: str): # pragma: no cover
1600
1718
  return organism
1601
1719
 
1602
1720
 
1721
+ def _ref_is_name(field: FieldAttr) -> bool | None:
1722
+ """Check if the reference field is a name field."""
1723
+ from ._can_validate import get_name_field
1724
+
1725
+ name_field = get_name_field(field.field.model)
1726
+ return field.field.name == name_field
1727
+
1728
+
1603
1729
  Curate = Curator # backward compat