lamindb 0.76.13__py3-none-any.whl → 0.76.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +39 -37
- lamindb/_can_validate.py +6 -6
- lamindb/_collection.py +18 -5
- lamindb/_curate.py +298 -172
- lamindb/_feature.py +2 -3
- lamindb/_feature_set.py +1 -2
- lamindb/_from_values.py +1 -5
- lamindb/_is_versioned.py +1 -2
- lamindb/_parents.py +28 -5
- lamindb/_query_manager.py +1 -2
- lamindb/_query_set.py +8 -4
- lamindb/_record.py +78 -4
- lamindb/_save.py +2 -2
- lamindb/_transform.py +1 -2
- lamindb/_ulabel.py +1 -1
- lamindb/core/__init__.py +2 -0
- lamindb/core/_data.py +19 -7
- lamindb/core/_feature_manager.py +76 -42
- lamindb/core/_label_manager.py +21 -0
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/exceptions.py +7 -0
- lamindb/core/storage/_backed_access.py +16 -8
- lamindb/core/storage/_pyarrow_dataset.py +31 -0
- lamindb/core/types.py +1 -0
- {lamindb-0.76.13.dist-info → lamindb-0.76.15.dist-info}/METADATA +9 -10
- {lamindb-0.76.13.dist-info → lamindb-0.76.15.dist-info}/RECORD +29 -28
- {lamindb-0.76.13.dist-info → lamindb-0.76.15.dist-info}/LICENSE +0 -0
- {lamindb-0.76.13.dist-info → lamindb-0.76.15.dist-info}/WHEEL +0 -0
lamindb/_curate.py
CHANGED
@@ -34,21 +34,21 @@ class CurateLookup:
|
|
34
34
|
categoricals: dict[str, FieldAttr],
|
35
35
|
slots: dict[str, FieldAttr] = None,
|
36
36
|
using_key: str | None = None,
|
37
|
+
public: bool = False,
|
37
38
|
) -> None:
|
38
39
|
if slots is None:
|
39
40
|
slots = {}
|
40
41
|
self._fields = {**categoricals, **slots}
|
41
42
|
self._using_key = None if using_key == "default" else using_key
|
42
43
|
self._using_key_name = self._using_key or ln_setup.settings.instance.slug
|
43
|
-
|
44
|
-
|
45
|
-
)
|
44
|
+
self._public = public
|
45
|
+
debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
|
46
46
|
logger.debug(debug_message)
|
47
47
|
|
48
48
|
def __getattr__(self, name):
|
49
49
|
if name in self._fields:
|
50
50
|
registry = self._fields[name].field.model
|
51
|
-
if self.
|
51
|
+
if self._public and hasattr(registry, "public"):
|
52
52
|
return registry.public().lookup()
|
53
53
|
else:
|
54
54
|
return get_registry_instance(registry, self._using_key).lookup()
|
@@ -59,7 +59,7 @@ class CurateLookup:
|
|
59
59
|
def __getitem__(self, name):
|
60
60
|
if name in self._fields:
|
61
61
|
registry = self._fields[name].field.model
|
62
|
-
if self.
|
62
|
+
if self._public and hasattr(registry, "public"):
|
63
63
|
return registry.public().lookup()
|
64
64
|
else:
|
65
65
|
return get_registry_instance(registry, self._using_key).lookup()
|
@@ -75,12 +75,14 @@ class CurateLookup:
|
|
75
75
|
getitem_keys = "\n ".join(
|
76
76
|
[str([key]) for key in self._fields if not key.isidentifier()]
|
77
77
|
)
|
78
|
+
ref = "public" if self._public else self._using_key_name
|
78
79
|
return (
|
79
|
-
f"Lookup objects from the {colors.italic(
|
80
|
+
f"Lookup objects from the {colors.italic(ref)}:\n "
|
80
81
|
f"{colors.green(getattr_keys)}\n "
|
81
|
-
f"{colors.green(getitem_keys)}\n
|
82
|
-
"Example:\n → categories = validator.lookup()
|
83
|
-
" → categories.alveolar_type_1_fibroblast_cell"
|
82
|
+
f"{colors.green(getitem_keys)}\n"
|
83
|
+
"Example:\n → categories = validator.lookup()['cell_type']\n"
|
84
|
+
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
85
|
+
"To look up public ontologies, use .lookup(public=True)"
|
84
86
|
)
|
85
87
|
else: # pragma: no cover
|
86
88
|
return colors.warning("No fields are found!")
|
@@ -97,12 +99,20 @@ class BaseCurator:
|
|
97
99
|
"""
|
98
100
|
pass
|
99
101
|
|
100
|
-
def save_artifact(
|
102
|
+
def save_artifact(
|
103
|
+
self,
|
104
|
+
description: str | None = None,
|
105
|
+
key: str | None = None,
|
106
|
+
revises: Artifact | None = None,
|
107
|
+
run: Run | None = None,
|
108
|
+
) -> Artifact:
|
101
109
|
"""Save the dataset as artifact.
|
102
110
|
|
103
111
|
Args:
|
104
|
-
description:
|
105
|
-
|
112
|
+
description: `str | None = None` A description of the DataFrame object.
|
113
|
+
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
114
|
+
revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
|
115
|
+
run: `Run | None = None` The run that creates the artifact.
|
106
116
|
|
107
117
|
Returns:
|
108
118
|
A saved artifact record.
|
@@ -174,7 +184,7 @@ class DataFrameCurator(BaseCurator):
|
|
174
184
|
def non_validated(self) -> list:
|
175
185
|
"""Return the non-validated features and labels."""
|
176
186
|
if self._non_validated is None:
|
177
|
-
raise
|
187
|
+
raise ValidationError("Please run validate() first!")
|
178
188
|
return self._non_validated
|
179
189
|
|
180
190
|
@property
|
@@ -182,7 +192,9 @@ class DataFrameCurator(BaseCurator):
|
|
182
192
|
"""Return the columns fields to validate against."""
|
183
193
|
return self._fields
|
184
194
|
|
185
|
-
def lookup(
|
195
|
+
def lookup(
|
196
|
+
self, using_key: str | None = None, public: bool = False
|
197
|
+
) -> CurateLookup:
|
186
198
|
"""Lookup categories.
|
187
199
|
|
188
200
|
Args:
|
@@ -194,6 +206,7 @@ class DataFrameCurator(BaseCurator):
|
|
194
206
|
categoricals=self._fields,
|
195
207
|
slots={"columns": self._columns_field},
|
196
208
|
using_key=using_key or self._using_key,
|
209
|
+
public=public,
|
197
210
|
)
|
198
211
|
|
199
212
|
def _check_valid_keys(self, extra: set = None) -> None:
|
@@ -209,7 +222,7 @@ class DataFrameCurator(BaseCurator):
|
|
209
222
|
valid_keys = set(self._df.columns) | {"columns"} | extra
|
210
223
|
nonval_keys = [key for key in d.keys() if key not in valid_keys]
|
211
224
|
if len(nonval_keys) > 0:
|
212
|
-
raise
|
225
|
+
raise ValidationError(
|
213
226
|
f"the following keys passed to {name} are not allowed: {nonval_keys}"
|
214
227
|
)
|
215
228
|
|
@@ -245,16 +258,6 @@ class DataFrameCurator(BaseCurator):
|
|
245
258
|
**kwargs,
|
246
259
|
)
|
247
260
|
|
248
|
-
def add_validated_from(self, key: str, organism: str | None = None):
|
249
|
-
"""Add validated categories.
|
250
|
-
|
251
|
-
Args:
|
252
|
-
key: The key referencing the slot in the DataFrame.
|
253
|
-
organism: The organism name.
|
254
|
-
"""
|
255
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
256
|
-
self._update_registry(key, validated_only=True, **self._kwargs)
|
257
|
-
|
258
261
|
def add_new_from(self, key: str, organism: str | None = None, **kwargs):
|
259
262
|
"""Add validated & new categories.
|
260
263
|
|
@@ -285,9 +288,11 @@ class DataFrameCurator(BaseCurator):
|
|
285
288
|
self._save_columns(validated_only=validated_only, **kwargs)
|
286
289
|
else:
|
287
290
|
if categorical not in self.fields:
|
288
|
-
raise
|
291
|
+
raise ValidationError(
|
292
|
+
f"Feature {categorical} is not part of the fields!"
|
293
|
+
)
|
289
294
|
update_registry(
|
290
|
-
values=self._df[categorical]
|
295
|
+
values=flatten_unique(self._df[categorical]),
|
291
296
|
field=self.fields[categorical],
|
292
297
|
key=categorical,
|
293
298
|
using_key=self._using_key,
|
@@ -300,7 +305,7 @@ class DataFrameCurator(BaseCurator):
|
|
300
305
|
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
301
306
|
"""Save labels for all features."""
|
302
307
|
for name in self.fields.keys():
|
303
|
-
logger.info(f"saving
|
308
|
+
logger.info(f"saving validated records of '{name}'")
|
304
309
|
self._update_registry(name, validated_only=validated_only, **kwargs)
|
305
310
|
|
306
311
|
def validate(self, organism: str | None = None) -> bool:
|
@@ -313,6 +318,10 @@ class DataFrameCurator(BaseCurator):
|
|
313
318
|
Whether the DataFrame is validated.
|
314
319
|
"""
|
315
320
|
self._kwargs.update({"organism": organism} if organism else {})
|
321
|
+
|
322
|
+
# add all validated records to the current instance
|
323
|
+
self._update_registry_all()
|
324
|
+
|
316
325
|
self._validated, self._non_validated = validate_categories_in_df( # type: ignore
|
317
326
|
self._df,
|
318
327
|
fields=self.fields,
|
@@ -323,12 +332,20 @@ class DataFrameCurator(BaseCurator):
|
|
323
332
|
)
|
324
333
|
return self._validated
|
325
334
|
|
326
|
-
def save_artifact(
|
335
|
+
def save_artifact(
|
336
|
+
self,
|
337
|
+
description: str | None = None,
|
338
|
+
key: str | None = None,
|
339
|
+
revises: Artifact | None = None,
|
340
|
+
run: Run | None = None,
|
341
|
+
) -> Artifact:
|
327
342
|
"""Save the validated DataFrame and metadata.
|
328
343
|
|
329
344
|
Args:
|
330
|
-
description: Description of the DataFrame object.
|
331
|
-
|
345
|
+
description: `str | None = None` Description of the DataFrame object.
|
346
|
+
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
347
|
+
revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
|
348
|
+
run: `Run | None = None` The run that creates the artifact.
|
332
349
|
|
333
350
|
Returns:
|
334
351
|
A saved artifact record.
|
@@ -344,15 +361,18 @@ class DataFrameCurator(BaseCurator):
|
|
344
361
|
verbosity = settings.verbosity
|
345
362
|
try:
|
346
363
|
settings.verbosity = "warning"
|
347
|
-
|
348
|
-
|
364
|
+
if not self._validated:
|
365
|
+
# save all validated records to the current instance
|
366
|
+
self._update_registry_all()
|
349
367
|
|
350
368
|
self._artifact = save_artifact(
|
351
369
|
self._df,
|
352
370
|
description=description,
|
353
371
|
fields=self.fields,
|
354
372
|
columns_field=self._columns_field,
|
355
|
-
|
373
|
+
key=key,
|
374
|
+
revises=revises,
|
375
|
+
run=run,
|
356
376
|
**self._kwargs,
|
357
377
|
)
|
358
378
|
finally:
|
@@ -457,7 +477,9 @@ class AnnDataCurator(DataFrameCurator):
|
|
457
477
|
"""Return the obs fields to validate against."""
|
458
478
|
return self._obs_fields
|
459
479
|
|
460
|
-
def lookup(
|
480
|
+
def lookup(
|
481
|
+
self, using_key: str | None = None, public: bool = False
|
482
|
+
) -> CurateLookup:
|
461
483
|
"""Lookup categories.
|
462
484
|
|
463
485
|
Args:
|
@@ -469,6 +491,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
469
491
|
categoricals=self._obs_fields,
|
470
492
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
471
493
|
using_key=using_key or self._using_key,
|
494
|
+
public=public,
|
472
495
|
)
|
473
496
|
|
474
497
|
def _save_from_var_index(
|
@@ -479,7 +502,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
479
502
|
values=list(self._adata.var.index),
|
480
503
|
field=self.var_index,
|
481
504
|
key="var_index",
|
482
|
-
save_function="add_new_from_var_index",
|
505
|
+
save_function=".add_new_from_var_index()",
|
483
506
|
using_key=self._using_key,
|
484
507
|
validated_only=validated_only,
|
485
508
|
organism=organism,
|
@@ -489,12 +512,11 @@ class AnnDataCurator(DataFrameCurator):
|
|
489
512
|
|
490
513
|
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
491
514
|
"""Save labels for all features."""
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
self._update_registry(name, validated_only=validated_only, **kwargs)
|
515
|
+
logger.info("saving validated records of 'var_index'")
|
516
|
+
self._save_from_var_index(validated_only=validated_only, **self._kwargs)
|
517
|
+
for name in self._obs_fields.keys():
|
518
|
+
logger.info(f"saving validated terms of '{name}'")
|
519
|
+
self._update_registry(name, validated_only=validated_only, **self._kwargs)
|
498
520
|
|
499
521
|
def add_new_from_var_index(self, organism: str | None = None, **kwargs):
|
500
522
|
"""Update variable records.
|
@@ -506,15 +528,6 @@ class AnnDataCurator(DataFrameCurator):
|
|
506
528
|
self._kwargs.update({"organism": organism} if organism else {})
|
507
529
|
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
508
530
|
|
509
|
-
def add_validated_from_var_index(self, organism: str | None = None):
|
510
|
-
"""Add validated variable records.
|
511
|
-
|
512
|
-
Args:
|
513
|
-
organism: The organism name.
|
514
|
-
"""
|
515
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
516
|
-
self._save_from_var_index(validated_only=True, **self._kwargs)
|
517
|
-
|
518
531
|
def validate(self, organism: str | None = None) -> bool:
|
519
532
|
"""Validate categories.
|
520
533
|
|
@@ -530,6 +543,9 @@ class AnnDataCurator(DataFrameCurator):
|
|
530
543
|
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
531
544
|
)
|
532
545
|
|
546
|
+
# add all validated records to the current instance
|
547
|
+
self._update_registry_all()
|
548
|
+
|
533
549
|
validated_var, non_validated_var = validate_categories(
|
534
550
|
self._adata.var.index,
|
535
551
|
field=self._var_field,
|
@@ -554,30 +570,49 @@ class AnnDataCurator(DataFrameCurator):
|
|
554
570
|
self._validated = validated_var and validated_obs
|
555
571
|
return self._validated
|
556
572
|
|
557
|
-
def save_artifact(
|
573
|
+
def save_artifact(
|
574
|
+
self,
|
575
|
+
description: str | None = None,
|
576
|
+
key: str | None = None,
|
577
|
+
revises: Artifact | None = None,
|
578
|
+
run: Run | None = None,
|
579
|
+
) -> Artifact:
|
558
580
|
"""Save the validated ``AnnData`` and metadata.
|
559
581
|
|
560
582
|
Args:
|
561
|
-
description:
|
562
|
-
|
583
|
+
description: `str | None = None` A description of the ``AnnData`` object.
|
584
|
+
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
585
|
+
revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
|
586
|
+
run: `Run | None = None` The run that creates the artifact.
|
563
587
|
|
564
588
|
Returns:
|
565
589
|
A saved artifact record.
|
566
590
|
"""
|
591
|
+
from lamindb.core._settings import settings
|
592
|
+
|
567
593
|
if not self._validated:
|
568
594
|
self.validate()
|
569
595
|
if not self._validated:
|
570
596
|
raise ValidationError("Dataset does not validate. Please curate.")
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
597
|
+
verbosity = settings.verbosity
|
598
|
+
try:
|
599
|
+
settings.verbosity = "warning"
|
600
|
+
if not self._validated:
|
601
|
+
# save all validated records to the current instance
|
602
|
+
self._update_registry_all()
|
603
|
+
self._artifact = save_artifact(
|
604
|
+
self._data,
|
605
|
+
adata=self._adata,
|
606
|
+
description=description,
|
607
|
+
columns_field=self.var_index,
|
608
|
+
fields=self.categoricals,
|
609
|
+
key=key,
|
610
|
+
revises=revises,
|
611
|
+
run=run,
|
612
|
+
**self._kwargs,
|
613
|
+
)
|
614
|
+
finally:
|
615
|
+
settings.verbosity = verbosity
|
581
616
|
return self._artifact
|
582
617
|
|
583
618
|
|
@@ -656,10 +691,6 @@ class MuDataCurator:
|
|
656
691
|
)
|
657
692
|
for modality in self._modalities
|
658
693
|
}
|
659
|
-
for modality in self._var_fields.keys():
|
660
|
-
self._save_from_var_index_modality(
|
661
|
-
modality=modality, validated_only=True, **self._kwargs
|
662
|
-
)
|
663
694
|
|
664
695
|
@property
|
665
696
|
def var_index(self) -> FieldAttr:
|
@@ -675,7 +706,7 @@ class MuDataCurator:
|
|
675
706
|
"""Verify the modality exists."""
|
676
707
|
for modality in modalities:
|
677
708
|
if modality not in self._mdata.mod.keys():
|
678
|
-
raise
|
709
|
+
raise ValidationError(f"modality '{modality}' does not exist!")
|
679
710
|
|
680
711
|
def _save_from_var_index_modality(
|
681
712
|
self, modality: str, validated_only: bool = True, **kwargs
|
@@ -685,7 +716,7 @@ class MuDataCurator:
|
|
685
716
|
values=list(self._mdata[modality].var.index),
|
686
717
|
field=self._var_fields[modality],
|
687
718
|
key="var_index",
|
688
|
-
save_function=
|
719
|
+
save_function=f'.add_new_from_var_index("{modality}")',
|
689
720
|
using_key=self._using_key,
|
690
721
|
validated_only=validated_only,
|
691
722
|
dtype="number",
|
@@ -700,7 +731,7 @@ class MuDataCurator:
|
|
700
731
|
obs_fields: dict[str, dict[str, FieldAttr]] = {}
|
701
732
|
for k, v in categoricals.items():
|
702
733
|
if k not in self._mdata.obs.columns:
|
703
|
-
raise
|
734
|
+
raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
|
704
735
|
if any(k.startswith(prefix) for prefix in prefixes):
|
705
736
|
modality, col = k.split(":")[0], k.split(":")[1]
|
706
737
|
if modality not in obs_fields.keys():
|
@@ -712,7 +743,9 @@ class MuDataCurator:
|
|
712
743
|
obs_fields["obs"][k] = v
|
713
744
|
return obs_fields
|
714
745
|
|
715
|
-
def lookup(
|
746
|
+
def lookup(
|
747
|
+
self, using_key: str | None = None, public: bool = False
|
748
|
+
) -> CurateLookup:
|
716
749
|
"""Lookup categories.
|
717
750
|
|
718
751
|
Args:
|
@@ -727,6 +760,7 @@ class MuDataCurator:
|
|
727
760
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
728
761
|
},
|
729
762
|
using_key=using_key or self._using_key,
|
763
|
+
public=public,
|
730
764
|
)
|
731
765
|
|
732
766
|
def add_new_from_columns(
|
@@ -774,33 +808,14 @@ class MuDataCurator:
|
|
774
808
|
modality=modality, validated_only=False, **self._kwargs, **kwargs
|
775
809
|
)
|
776
810
|
|
777
|
-
def
|
778
|
-
"""
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
self._save_from_var_index_modality(
|
786
|
-
modality=modality, validated_only=True, **self._kwargs
|
787
|
-
)
|
788
|
-
|
789
|
-
def add_validated_from(
|
790
|
-
self, key: str, modality: str | None = None, organism: str | None = None
|
791
|
-
):
|
792
|
-
"""Add validated categories.
|
793
|
-
|
794
|
-
Args:
|
795
|
-
key: The key referencing the slot in the DataFrame.
|
796
|
-
modality: The modality name.
|
797
|
-
organism: The organism name.
|
798
|
-
"""
|
799
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
800
|
-
modality = modality or "obs"
|
801
|
-
if modality in self._df_annotators:
|
802
|
-
df_annotator = self._df_annotators[modality]
|
803
|
-
df_annotator.add_validated_from(key=key, **self._kwargs)
|
811
|
+
def _update_registry_all(self):
|
812
|
+
"""Update all registries."""
|
813
|
+
for modality in self._var_fields.keys():
|
814
|
+
self._save_from_var_index_modality(
|
815
|
+
modality=modality, validated_only=True, **self._kwargs
|
816
|
+
)
|
817
|
+
for _, df_annotator in self._df_annotators.items():
|
818
|
+
df_annotator._update_registry_all(validated_only=True, **self._kwargs)
|
804
819
|
|
805
820
|
def add_new_from(
|
806
821
|
self,
|
@@ -827,11 +842,22 @@ class MuDataCurator:
|
|
827
842
|
|
828
843
|
def validate(self, organism: str | None = None) -> bool:
|
829
844
|
"""Validate categories."""
|
845
|
+
from lamindb.core._settings import settings
|
846
|
+
|
830
847
|
self._kwargs.update({"organism": organism} if organism else {})
|
831
848
|
if self._using_key is not None and self._using_key != "default":
|
832
849
|
logger.important(
|
833
850
|
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
834
851
|
)
|
852
|
+
|
853
|
+
# add all validated records to the current instance
|
854
|
+
verbosity = settings.verbosity
|
855
|
+
try:
|
856
|
+
settings.verbosity = "error"
|
857
|
+
self._update_registry_all()
|
858
|
+
finally:
|
859
|
+
settings.verbosity = verbosity
|
860
|
+
|
835
861
|
validated_var = True
|
836
862
|
non_validated_var_modality = {}
|
837
863
|
for modality, var_field in self._var_fields.items():
|
@@ -842,6 +868,7 @@ class MuDataCurator:
|
|
842
868
|
using_key=self._using_key,
|
843
869
|
source=self._sources.get(modality, {}).get("var_index"),
|
844
870
|
exclude=self._exclude.get(modality, {}).get("var_index"),
|
871
|
+
validated_hint_print=f'.add_validated_from_var_index("{modality}")',
|
845
872
|
**self._kwargs, # type: ignore
|
846
873
|
)
|
847
874
|
validated_var &= is_validated_var
|
@@ -874,56 +901,75 @@ class MuDataCurator:
|
|
874
901
|
self._validated = validated_var and validated_obs
|
875
902
|
return self._validated
|
876
903
|
|
877
|
-
def save_artifact(
|
904
|
+
def save_artifact(
|
905
|
+
self,
|
906
|
+
description: str | None = None,
|
907
|
+
key: str | None = None,
|
908
|
+
revises: Artifact | None = None,
|
909
|
+
run: Run | None = None,
|
910
|
+
) -> Artifact:
|
878
911
|
"""Save the validated ``MuData`` and metadata.
|
879
912
|
|
880
913
|
Args:
|
881
|
-
description:
|
882
|
-
|
914
|
+
description: `str | None = None` A description of the ``MuData`` object.
|
915
|
+
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
916
|
+
revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
|
917
|
+
run: `Run | None = None` The run that creates the artifact.
|
883
918
|
|
884
919
|
Returns:
|
885
920
|
A saved artifact record.
|
886
921
|
"""
|
922
|
+
from lamindb.core._settings import settings
|
923
|
+
|
887
924
|
if not self._validated:
|
888
|
-
|
925
|
+
self.validate()
|
926
|
+
if not self._validated:
|
927
|
+
raise ValidationError("Dataset does not validate. Please curate.")
|
928
|
+
verbosity = settings.verbosity
|
929
|
+
try:
|
930
|
+
settings.verbosity = "warning"
|
931
|
+
if not self._validated:
|
932
|
+
# save all validated records to the current instance
|
933
|
+
self._update_registry_all()
|
889
934
|
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
935
|
+
self._artifact = save_artifact(
|
936
|
+
self._mdata,
|
937
|
+
description=description,
|
938
|
+
columns_field=self.var_index,
|
939
|
+
fields=self.categoricals,
|
940
|
+
key=key,
|
941
|
+
revises=revises,
|
942
|
+
run=run,
|
943
|
+
**self._kwargs,
|
944
|
+
)
|
945
|
+
finally:
|
946
|
+
settings.verbosity = verbosity
|
898
947
|
return self._artifact
|
899
948
|
|
900
949
|
|
901
950
|
class Curator(BaseCurator):
|
902
951
|
"""Dataset curator.
|
903
952
|
|
904
|
-
|
905
|
-
to facilitate data integration, interpretation and analysis.
|
906
|
-
|
907
|
-
The curation flow has several steps:
|
908
|
-
|
909
|
-
1. Instantiate `Curator` from one of the following dataset objects:
|
910
|
-
|
911
|
-
- :meth:`~lamindb.Curator.from_df`
|
912
|
-
- :meth:`~lamindb.Curator.from_anndata`
|
913
|
-
- :meth:`~lamindb.Curator.from_mudata`
|
953
|
+
A `Curator` object makes it easy to save validated & annotated artifacts.
|
914
954
|
|
915
|
-
|
955
|
+
Example:
|
916
956
|
|
917
|
-
|
957
|
+
>>> curator = ln.Curator.from_df(
|
958
|
+
>>> df,
|
959
|
+
>>> # define validation criteria as mappings
|
960
|
+
>>> columns=ln.Feature.name, # map column names
|
961
|
+
>>> categoricals={"perturbation": ln.ULabel.name}, # map categories
|
962
|
+
>>> )
|
963
|
+
>>> curator.validate() # validate the data in df
|
964
|
+
>>> artifact = curator.save_artifact(description="my RNA-seq")
|
965
|
+
>>> artifact.describe() # see annotations
|
918
966
|
|
919
|
-
|
920
|
-
- Values which are new and not yet validated or potentially problematic values.
|
967
|
+
`curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
|
921
968
|
|
922
|
-
|
969
|
+
If you find non-validated values, you have several options:
|
923
970
|
|
924
|
-
-
|
925
|
-
-
|
926
|
-
- All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
|
971
|
+
- new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`
|
972
|
+
- non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and addressed manually
|
927
973
|
"""
|
928
974
|
|
929
975
|
@classmethod
|
@@ -1076,7 +1122,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
|
|
1076
1122
|
import bionty as bt
|
1077
1123
|
|
1078
1124
|
if organism is None and bt.settings.organism is None:
|
1079
|
-
raise
|
1125
|
+
raise ValidationError(
|
1080
1126
|
f"{registry.__name__} registry requires an organism!\n"
|
1081
1127
|
" → please pass an organism name via organism="
|
1082
1128
|
)
|
@@ -1104,8 +1150,8 @@ def validate_categories(
|
|
1104
1150
|
using_key: A reference LaminDB instance.
|
1105
1151
|
organism: The organism name.
|
1106
1152
|
source: The source record.
|
1107
|
-
exclude: Exclude specific values.
|
1108
|
-
standardize:
|
1153
|
+
exclude: Exclude specific values from validation.
|
1154
|
+
standardize: Whether to standardize the values.
|
1109
1155
|
validated_hint_print: The hint to print for validated values.
|
1110
1156
|
"""
|
1111
1157
|
from lamindb._from_values import _print_values
|
@@ -1166,15 +1212,18 @@ def validate_categories(
|
|
1166
1212
|
|
1167
1213
|
validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
|
1168
1214
|
n_validated = len(values_validated)
|
1215
|
+
|
1169
1216
|
if n_validated > 0:
|
1170
1217
|
_log_mapping_info()
|
1218
|
+
terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}"
|
1219
|
+
val_numerous = "" if n_validated == 1 else "s"
|
1171
1220
|
logger.warning(
|
1172
|
-
f"found {colors.yellow(n_validated)} validated
|
1173
|
-
f"{colors.yellow(
|
1174
|
-
f"{colors.yellow(validated_hint_print)}"
|
1221
|
+
f"found {colors.yellow(n_validated)} validated term{val_numerous}: "
|
1222
|
+
f"{colors.yellow(terms_str)}\n"
|
1223
|
+
f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}"
|
1175
1224
|
)
|
1176
1225
|
|
1177
|
-
non_validated_hint_print =
|
1226
|
+
non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_")
|
1178
1227
|
non_validated = [i for i in non_validated if i not in values_validated]
|
1179
1228
|
n_non_validated = len(non_validated)
|
1180
1229
|
if n_non_validated == 0:
|
@@ -1186,13 +1235,15 @@ def validate_categories(
|
|
1186
1235
|
# validated values still need to be saved to the current instance
|
1187
1236
|
return False, []
|
1188
1237
|
else:
|
1189
|
-
|
1238
|
+
non_val_numerous = ("", "is") if n_non_validated == 1 else ("s", "are")
|
1190
1239
|
print_values = _print_values(non_validated)
|
1191
1240
|
warning_message = (
|
1192
|
-
f"{colors.red(f'{n_non_validated}
|
1193
|
-
f"{colors.red(print_values)
|
1241
|
+
f"{colors.red(f'{n_non_validated} term{non_val_numerous[0]}')} {non_val_numerous[1]} not validated: "
|
1242
|
+
f"{colors.red(', '.join(print_values.split(', ')[:10]) + ', ...' if len(print_values.split(', ')) > 10 else print_values)}\n"
|
1243
|
+
f"→ fix typo{non_val_numerous[0]}, remove non-existent value{non_val_numerous[0]}, or save term{non_val_numerous[0]} via "
|
1194
1244
|
f"{colors.red(non_validated_hint_print)}"
|
1195
1245
|
)
|
1246
|
+
|
1196
1247
|
if logger.indent == "":
|
1197
1248
|
_log_mapping_info()
|
1198
1249
|
logger.warning(warning_message)
|
@@ -1239,7 +1290,9 @@ def save_artifact(
|
|
1239
1290
|
description: str | None = None,
|
1240
1291
|
organism: str | None = None,
|
1241
1292
|
adata: ad.AnnData | None = None,
|
1242
|
-
|
1293
|
+
key: str | None = None,
|
1294
|
+
revises: Artifact | None = None,
|
1295
|
+
run: Run | None = None,
|
1243
1296
|
) -> Artifact:
|
1244
1297
|
"""Save all metadata with an Artifact.
|
1245
1298
|
|
@@ -1249,29 +1302,43 @@ def save_artifact(
|
|
1249
1302
|
fields: A dictionary mapping obs_column to registry_field.
|
1250
1303
|
columns_field: The registry field to validate variables index against.
|
1251
1304
|
organism: The organism name.
|
1252
|
-
adata: The AnnData object to save, must be provided if data is a path.
|
1253
|
-
|
1305
|
+
adata: The AnnData object to save and get n_observations, must be provided if data is a path.
|
1306
|
+
type: `Literal["dataset", "model"] | None = None` The artifact type.
|
1307
|
+
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
1308
|
+
revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
|
1309
|
+
run: `Run | None = None` The run that creates the artifact.
|
1254
1310
|
|
1255
1311
|
Returns:
|
1256
1312
|
The saved Artifact.
|
1257
1313
|
"""
|
1258
1314
|
from ._artifact import data_is_anndata
|
1315
|
+
from .core._data import add_labels
|
1259
1316
|
|
1260
1317
|
artifact = None
|
1261
1318
|
if data_is_anndata(data):
|
1262
1319
|
assert adata is not None # noqa: S101
|
1263
|
-
artifact = Artifact.from_anndata(
|
1320
|
+
artifact = Artifact.from_anndata(
|
1321
|
+
data, description=description, key=key, revises=revises, run=run
|
1322
|
+
)
|
1264
1323
|
artifact.n_observations = adata.shape[0]
|
1265
1324
|
data = adata
|
1266
1325
|
|
1267
1326
|
elif isinstance(data, pd.DataFrame):
|
1268
|
-
artifact = Artifact.from_df(
|
1327
|
+
artifact = Artifact.from_df(
|
1328
|
+
data, description=description, key=key, revises=revises, run=run
|
1329
|
+
)
|
1269
1330
|
else:
|
1270
1331
|
try:
|
1271
1332
|
from mudata import MuData
|
1272
1333
|
|
1273
1334
|
if isinstance(data, MuData):
|
1274
|
-
artifact = Artifact.from_mudata(
|
1335
|
+
artifact = Artifact.from_mudata(
|
1336
|
+
data,
|
1337
|
+
description=description,
|
1338
|
+
key=key,
|
1339
|
+
revises=revises,
|
1340
|
+
run=run,
|
1341
|
+
)
|
1275
1342
|
artifact.n_observations = data.n_obs
|
1276
1343
|
except ImportError:
|
1277
1344
|
pass
|
@@ -1301,7 +1368,12 @@ def save_artifact(
|
|
1301
1368
|
else:
|
1302
1369
|
raise NotImplementedError
|
1303
1370
|
|
1304
|
-
def _add_labels(
|
1371
|
+
def _add_labels(
|
1372
|
+
data,
|
1373
|
+
artifact: Artifact,
|
1374
|
+
fields: dict[str, FieldAttr],
|
1375
|
+
feature_ref_is_name: bool | None = None,
|
1376
|
+
):
|
1305
1377
|
features = Feature.lookup().dict()
|
1306
1378
|
for key, field in fields.items():
|
1307
1379
|
feature = features.get(key)
|
@@ -1314,16 +1386,47 @@ def save_artifact(
|
|
1314
1386
|
field=field,
|
1315
1387
|
**filter_kwargs_current,
|
1316
1388
|
)
|
1317
|
-
|
1389
|
+
if len(labels) == 0:
|
1390
|
+
continue
|
1391
|
+
if hasattr(registry, "_name_field"):
|
1392
|
+
label_ref_is_name = field.field.name == registry._name_field
|
1393
|
+
add_labels(
|
1394
|
+
artifact,
|
1395
|
+
records=labels,
|
1396
|
+
feature=feature,
|
1397
|
+
feature_ref_is_name=feature_ref_is_name,
|
1398
|
+
label_ref_is_name=label_ref_is_name,
|
1399
|
+
)
|
1318
1400
|
|
1319
1401
|
if artifact._accessor == "MuData":
|
1320
1402
|
for modality, modality_fields in fields.items():
|
1403
|
+
column_field_modality = columns_field.get(modality)
|
1321
1404
|
if modality == "obs":
|
1322
|
-
_add_labels(
|
1405
|
+
_add_labels(
|
1406
|
+
data,
|
1407
|
+
artifact,
|
1408
|
+
modality_fields,
|
1409
|
+
feature_ref_is_name=(
|
1410
|
+
None
|
1411
|
+
if column_field_modality is None
|
1412
|
+
else _ref_is_name(column_field_modality)
|
1413
|
+
),
|
1414
|
+
)
|
1323
1415
|
else:
|
1324
|
-
_add_labels(
|
1416
|
+
_add_labels(
|
1417
|
+
data[modality],
|
1418
|
+
artifact,
|
1419
|
+
modality_fields,
|
1420
|
+
feature_ref_is_name=(
|
1421
|
+
None
|
1422
|
+
if column_field_modality is None
|
1423
|
+
else _ref_is_name(column_field_modality)
|
1424
|
+
),
|
1425
|
+
)
|
1325
1426
|
else:
|
1326
|
-
_add_labels(
|
1427
|
+
_add_labels(
|
1428
|
+
data, artifact, fields, feature_ref_is_name=_ref_is_name(columns_field)
|
1429
|
+
)
|
1327
1430
|
|
1328
1431
|
slug = ln_setup.settings.instance.slug
|
1329
1432
|
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
@@ -1331,6 +1434,19 @@ def save_artifact(
|
|
1331
1434
|
return artifact
|
1332
1435
|
|
1333
1436
|
|
1437
|
+
def flatten_unique(series):
|
1438
|
+
"""Flatten a pandas series if it contains lists."""
|
1439
|
+
result = set()
|
1440
|
+
|
1441
|
+
for item in series:
|
1442
|
+
if isinstance(item, list):
|
1443
|
+
result.update(item)
|
1444
|
+
else:
|
1445
|
+
result.add(item)
|
1446
|
+
|
1447
|
+
return list(result)
|
1448
|
+
|
1449
|
+
|
1334
1450
|
def update_registry(
|
1335
1451
|
values: list[str],
|
1336
1452
|
field: FieldAttr,
|
@@ -1438,7 +1554,8 @@ def update_registry(
|
|
1438
1554
|
if not validated_only:
|
1439
1555
|
non_validated_records = []
|
1440
1556
|
if df is not None and registry == Feature:
|
1441
|
-
|
1557
|
+
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
1558
|
+
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
1442
1559
|
else:
|
1443
1560
|
if "organism" in filter_kwargs:
|
1444
1561
|
# make sure organism record is saved to the current instance
|
@@ -1499,24 +1616,25 @@ def log_saved_labels(
|
|
1499
1616
|
continue
|
1500
1617
|
|
1501
1618
|
if k == "without reference" and validated_only:
|
1502
|
-
|
1503
|
-
|
1504
|
-
)
|
1505
|
-
|
1506
|
-
|
1507
|
-
)
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
msg +=
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
)
|
1516
|
-
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1619
|
+
continue
|
1620
|
+
# msg = colors.yellow(
|
1621
|
+
# f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
|
1622
|
+
# )
|
1623
|
+
# lookup_print = (
|
1624
|
+
# f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
|
1625
|
+
# )
|
1626
|
+
|
1627
|
+
# hint = f".add_new_from('{key}')"
|
1628
|
+
# msg += f"\n → to lookup values, use {lookup_print}"
|
1629
|
+
# msg += (
|
1630
|
+
# f"\n → to save, run {colors.yellow(hint)}"
|
1631
|
+
# if save_function == "add_new_from"
|
1632
|
+
# else f"\n → to save, run {colors.yellow(save_function)}"
|
1633
|
+
# )
|
1634
|
+
# if warning:
|
1635
|
+
# logger.warning(msg)
|
1636
|
+
# else:
|
1637
|
+
# logger.info(msg)
|
1520
1638
|
else:
|
1521
1639
|
k = "" if k == "without reference" else f"{colors.green(k)} "
|
1522
1640
|
# the term "transferred" stresses that this is always in the context of transferring
|
@@ -1534,8 +1652,8 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
|
|
1534
1652
|
all_records = registry.from_values(list(values), field=field)
|
1535
1653
|
is_feature = registry.filter(name=f"is_{key}").one_or_none()
|
1536
1654
|
if is_feature is None:
|
1537
|
-
is_feature = registry(name=f"is_{key}")
|
1538
|
-
|
1655
|
+
is_feature = registry(name=f"is_{key}").save()
|
1656
|
+
logger.important(f"Created a parent ULabel: {is_feature}")
|
1539
1657
|
is_feature.children.add(*all_records)
|
1540
1658
|
|
1541
1659
|
|
@@ -1592,7 +1710,7 @@ def _save_organism(name: str): # pragma: no cover
|
|
1592
1710
|
if organism is None:
|
1593
1711
|
organism = bt.Organism.from_source(name=name)
|
1594
1712
|
if organism is None:
|
1595
|
-
raise
|
1713
|
+
raise ValidationError(
|
1596
1714
|
f"Organism '{name}' not found\n"
|
1597
1715
|
f" → please save it: bt.Organism(name='{name}').save()"
|
1598
1716
|
)
|
@@ -1600,4 +1718,12 @@ def _save_organism(name: str): # pragma: no cover
|
|
1600
1718
|
return organism
|
1601
1719
|
|
1602
1720
|
|
1721
|
+
def _ref_is_name(field: FieldAttr) -> bool | None:
|
1722
|
+
"""Check if the reference field is a name field."""
|
1723
|
+
from ._can_validate import get_name_field
|
1724
|
+
|
1725
|
+
name_field = get_name_field(field.field.model)
|
1726
|
+
return field.field.name == name_field
|
1727
|
+
|
1728
|
+
|
1603
1729
|
Curate = Curator # backward compat
|