lamindb 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff shows the changes between the two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- lamindb/__init__.py +3 -3
- lamindb/core/_context.py +64 -69
- lamindb/core/datasets/_small.py +2 -2
- lamindb/curators/__init__.py +683 -893
- lamindb/models/__init__.py +8 -1
- lamindb/models/_feature_manager.py +23 -19
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/artifact.py +210 -111
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +6 -4
- lamindb/models/feature.py +27 -30
- lamindb/models/has_parents.py +22 -7
- lamindb/models/project.py +2 -2
- lamindb/models/query_set.py +6 -35
- lamindb/models/record.py +167 -117
- lamindb/models/run.py +56 -2
- lamindb/models/save.py +1 -3
- lamindb/models/schema.py +277 -77
- lamindb/models/transform.py +4 -13
- {lamindb-1.3.1.dist-info → lamindb-1.4.0.dist-info}/METADATA +6 -5
- {lamindb-1.3.1.dist-info → lamindb-1.4.0.dist-info}/RECORD +24 -24
- {lamindb-1.3.1.dist-info → lamindb-1.4.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.1.dist-info → lamindb-1.4.0.dist-info}/WHEEL +0 -0
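Most of the churn below is in lamindb/curators/__init__.py: curators now create and save the artifact themselves (via Artifact.from_df / from_anndata / from_mudata / from_spatialdata), annotation is driven by per-column CatColumn objects, and the organism / verbosity arguments are dropped from the CatManager classes. As a rough orientation for reading the diff, here is a minimal sketch of the schema-based flow in 1.4.0; the schema name and key are illustrative assumptions, not taken from the package docs:

    import lamindb as ln

    # assumes a Schema named "my_schema" was saved earlier (hypothetical name)
    schema = ln.Schema.get(name="my_schema")
    curator = ln.curators.DataFrameCurator(df, schema)  # df: a pandas DataFrame
    curator.validate()  # raises ValidationError if the dataset does not validate
    artifact = curator.save_artifact(key="examples/df.parquet")  # creates and annotates the Artifact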
lamindb/curators/__init__.py
CHANGED
@@ -42,6 +42,8 @@ from lamindb_setup.core import deprecated
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.upath import UPath
 
+from lamindb.core._compat import is_package_installed
+
 if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
     from mudata import MuData
@@ -103,25 +105,26 @@ class CatLookup:
 
     def __init__(
         self,
-        categoricals: dict[str, FieldAttr],
+        categoricals: list[Feature] | dict[str, FieldAttr],
         slots: dict[str, FieldAttr] = None,
         public: bool = False,
-        organism: str | None = None,
         sources: dict[str, Record] | None = None,
     ) -> None:
         slots = slots or {}
+        if isinstance(categoricals, list):
+            categoricals = {
+                feature.name: parse_dtype(feature.dtype)[0]["field"]
+                for feature in categoricals
+            }
         self._categoricals = {**categoricals, **slots}
         self._public = public
-        self._organism = organism
         self._sources = sources
 
     def __getattr__(self, name):
         if name in self._categoricals:
             registry = self._categoricals[name].field.model
             if self._public and hasattr(registry, "public"):
-                return registry.public(
-                    organism=self._organism, source=self._sources.get(name)
-                ).lookup()
+                return registry.public(source=self._sources.get(name)).lookup()
             else:
                 return registry.lookup()
         raise AttributeError(
@@ -132,9 +135,7 @@ class CatLookup:
         if name in self._categoricals:
             registry = self._categoricals[name].field.model
             if self._public and hasattr(registry, "public"):
-                return registry.public(
-                    organism=self._organism, source=self._sources.get(name)
-                ).lookup()
+                return registry.public(source=self._sources.get(name)).lookup()
             else:
                 return registry.lookup()
         raise AttributeError(
@@ -240,6 +241,7 @@ class Curator:
         pass  # pragma: no cover
 
 
+# default implementation for MuDataCurator and SpatialDataCurator
 class SlotsCurator(Curator):
     """Curator for a dataset with slots.
 
@@ -262,7 +264,7 @@ class SlotsCurator(Curator):
         # in form of {table/modality_key: var_field}
         self._var_fields: dict[str, FieldAttr] = {}
         # in form of {table/modality_key: categoricals}
-        self.
+        self._cat_columns: dict[str, dict[str, CatColumn]] = {}
 
     @property
     @doc_args(SLOTS_DOCSTRING)
@@ -273,7 +275,8 @@ class SlotsCurator(Curator):
     @doc_args(VALIDATE_DOCSTRING)
     def validate(self) -> None:
         """{}"""  # noqa: D415
-        for
+        for slot, curator in self._slots.items():
+            logger.info(f"validating slot {slot} ...")
             curator.validate()
 
     @doc_args(SAVE_ARTIFACT_DOCSTRING)
@@ -288,18 +291,34 @@ class SlotsCurator(Curator):
         """{}"""  # noqa: D415
         if not self._is_validated:
             self.validate()
-
-
-
-
-
-
-
+        if self._artifact is None:
+            if data_is_mudata(self._dataset):
+                self._artifact = Artifact.from_mudata(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            elif data_is_spatialdata(self._dataset):
+                self._artifact = Artifact.from_spatialdata(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            self._artifact.schema = self._schema
+            self._artifact.save()
+        cat_columns = {}
+        for curator in self._slots.values():
+            for key, cat_column in curator._cat_manager._cat_columns.items():
+                cat_columns[key] = cat_column
+        return annotate_artifact(  # type: ignore
+            self._artifact,
             index_field=self._var_fields,
-            artifact=self._artifact,
-            revises=revises,
-            run=run,
             schema=self._schema,
+            cat_columns=cat_columns,
         )
 
 
@@ -373,11 +392,34 @@ class DataFrameCurator(Curator):
         schema: Schema,
     ) -> None:
         super().__init__(dataset=dataset, schema=schema)
-        categoricals =
+        categoricals = []
+        features = []
+        feature_ids: set[int] = set()
+        if schema.flexible and isinstance(self._dataset, pd.DataFrame):
+            features += Feature.filter(name__in=self._dataset.keys()).list()
+            feature_ids = {feature.id for feature in features}
         if schema.n > 0:
+            schema_features = schema.features.all().list()
+            if feature_ids:
+                features.extend(
+                    feature
+                    for feature in schema_features
+                    if feature.id not in feature_ids
+                )
+            else:
+                features.extend(schema_features)
+        else:
+            assert schema.itype is not None  # noqa: S101
+        if features:
             # populate features
             pandera_columns = {}
-
+            if schema.minimal_set:
+                optional_feature_uids = set(schema.optionals.get_uids())
+            for feature in features:
+                if schema.minimal_set:
+                    required = feature.uid not in optional_feature_uids
+                else:
+                    required = False
                 if feature.dtype in {"int", "float", "num"}:
                     dtype = (
                         self._dataset[feature.name].dtype
@@ -393,6 +435,7 @@ class DataFrameCurator(Curator):
                         ),
                         nullable=feature.nullable,
                         coerce=feature.coerce_dtype,
+                        required=required,
                     )
                 else:
                     pandera_dtype = (
@@ -404,14 +447,18 @@ class DataFrameCurator(Curator):
                         pandera_dtype,
                         nullable=feature.nullable,
                         coerce=feature.coerce_dtype,
+                        required=required,
                     )
                 if feature.dtype.startswith("cat"):
-                    categoricals
+                    # validate categoricals if the column is required or if the column is present
+                    if required or feature.name in self._dataset.columns:
+                        categoricals.append(feature)
             self._pandera_schema = pandera.DataFrameSchema(
-                pandera_columns,
+                pandera_columns,
+                coerce=schema.coerce_dtype,
+                strict=schema.maximal_set,
+                ordered=schema.ordered_set,
             )
-        else:
-            assert schema.itype is not None  # noqa: S101
         self._cat_manager = DataFrameCatManager(
             self._dataset,
             columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
@@ -504,16 +551,21 @@ class DataFrameCurator(Curator):
         if not self._is_validated:
             self.validate()  # raises ValidationError if doesn't validate
         result = parse_cat_dtype(self._schema.itype, is_itype=True)
-
-        self.
-
-
+        if self._artifact is None:
+            self._artifact = Artifact.from_df(
+                self._dataset,
+                key=key,
+                description=description,
+                revises=revises,
+                run=run,
+            )
+            self._artifact.schema = self._schema
+            self._artifact.save()
+        return annotate_artifact(  # type: ignore
+            self._artifact,
             index_field=result["field"],
-            key=key,
-            artifact=self._artifact,
-            revises=revises,
-            run=run,
             schema=self._schema,
+            cat_columns=self._cat_manager._cat_columns,
         )
 
 
@@ -548,7 +600,7 @@ class AnnDataCurator(SlotsCurator):
                 ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
                 ln.Feature(name="sample_note", dtype=str).save(),
                 ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
-                ln.Feature(name="cell_type_by_model", dtype=bt.CellType
+                ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
             ],
         ).save()
 
@@ -596,6 +648,12 @@ class AnnDataCurator(SlotsCurator):
             for slot, slot_schema in schema.slots.items()
             if slot in {"obs", "var", "uns"}
         }
+        # TODO: better way to handle this!
+        if "var" in self._slots:
+            self._slots["var"]._cat_manager._cat_columns["var_index"] = self._slots[
+                "var"
+            ]._cat_manager._cat_columns.pop("columns")
+            self._slots["var"]._cat_manager._cat_columns["var_index"]._key = "var_index"
 
     @doc_args(SAVE_ARTIFACT_DOCSTRING)
     def save_artifact(
@@ -609,23 +667,28 @@ class AnnDataCurator(SlotsCurator):
         """{}"""  # noqa: D415
         if not self._is_validated:
             self.validate()
-        if
-
-
-
-
-
-
+        if self._artifact is None:
+            self._artifact = Artifact.from_anndata(
+                self._dataset,
+                key=key,
+                description=description,
+                revises=revises,
+                run=run,
+            )
+            self._artifact.schema = self._schema
+            self._artifact.save()
+        return annotate_artifact(  # type: ignore
+            self._artifact,
+            cat_columns=(
+                self.slots["obs"]._cat_manager._cat_columns
+                if "obs" in self.slots
+                else {}
+            ),
             index_field=(
                 parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
                 if "var" in self._slots
                 else None
             ),
-            key=key,
-            artifact=self._artifact,
-            revises=revises,
-            run=run,
             schema=self._schema,
         )
 
@@ -636,14 +699,14 @@ def _assign_var_fields_categoricals_multimodal(
     slot: str,
     slot_schema: Schema,
     var_fields: dict[str, FieldAttr],
-
+    cat_columns: dict[str, dict[str, CatColumn]],
     slots: dict[str, DataFrameCurator],
 ) -> None:
     """Assigns var_fields and categoricals for multimodal data curators."""
     if modality is not None:
         # Makes sure that all tables are present
         var_fields[modality] = None
-
+        cat_columns[modality] = {}
 
     if slot_type == "var":
         var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
@@ -654,12 +717,12 @@ def _assign_var_fields_categoricals_multimodal(
         # Note that this is NOT nested since the nested key is always "var"
         var_fields[modality] = var_field
     else:
-        obs_fields = slots[slot]._cat_manager.
+        obs_fields = slots[slot]._cat_manager._cat_columns
         if modality is None:
-
+            cat_columns[slot] = obs_fields
         else:
             # Note that this is NOT nested since the nested key is always "obs"
-
+            cat_columns[modality] = obs_fields
 
 
 class MuDataCurator(SlotsCurator):
@@ -770,7 +833,7 @@ class MuDataCurator(SlotsCurator):
                 slot=slot,
                 slot_schema=slot_schema,
                 var_fields=self._var_fields,
-
+                cat_columns=self._cat_columns,
                 slots=self._slots,
             )
 
@@ -890,7 +953,7 @@ class SpatialDataCurator(SlotsCurator):
                 slot=slot,
                 slot_schema=slot_schema,
                 var_fields=self._var_fields,
-
+                cat_columns=self._cat_columns,
                 slots=self._slots,
             )
 
@@ -898,6 +961,296 @@ class SpatialDataCurator(SlotsCurator):
         self._columns_field = self._var_fields
 
 
+class CatColumn:
+    """Categorical column for `DataFrame`.
+
+    Args:
+        values_getter: A callable or iterable that returns the values to validate.
+        field: The field to validate against.
+        key: The name of the column to validate. Only used for logging.
+        values_setter: A callable that sets the values.
+        source: The source to validate against.
+    """
+
+    def __init__(
+        self,
+        values_getter: Callable | Iterable[str],
+        field: FieldAttr,
+        key: str,
+        values_setter: Callable | None = None,
+        source: Record | None = None,
+        feature: Feature | None = None,
+    ) -> None:
+        self._values_getter = values_getter
+        self._values_setter = values_setter
+        self._field = field
+        self._key = key
+        self._source = source
+        self._organism = None
+        self._validated: None | list[str] = None
+        self._non_validated: None | list[str] = None
+        self._synonyms: None | dict[str, str] = None
+        self.feature = feature
+        self.labels = None
+        if hasattr(field.field.model, "_name_field"):
+            label_ref_is_name = field.field.name == field.field.model._name_field
+        else:
+            label_ref_is_name = field.field.name == "name"
+        self.label_ref_is_name = label_ref_is_name
+
+    @property
+    def values(self):
+        """Get the current values using the getter function."""
+        if callable(self._values_getter):
+            return self._values_getter()
+        return self._values_getter
+
+    @values.setter
+    def values(self, new_values):
+        """Set new values using the setter function if available."""
+        if callable(self._values_setter):
+            self._values_setter(new_values)
+        else:
+            # If values_getter is not callable, it's a direct reference we can update
+            self._values_getter = new_values
+
+    @property
+    def is_validated(self) -> bool:
+        """Return whether the column is validated."""
+        return len(self._non_validated) == 0
+
+    def _replace_synonyms(self) -> list[str]:
+        """Replace synonyms in the column with standardized values."""
+        syn_mapper = self._synonyms
+        # replace the values in df
+        std_values = self.values.map(
+            lambda unstd_val: syn_mapper.get(unstd_val, unstd_val)
+        )
+        # remove the standardized values from self.non_validated
+        non_validated = [i for i in self._non_validated if i not in syn_mapper]
+        if len(non_validated) == 0:
+            self._non_validated = []
+        else:
+            self._non_validated = non_validated  # type: ignore
+        # logging
+        n = len(syn_mapper)
+        if n > 0:
+            syn_mapper_print = _format_values(
+                [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
+            )
+            s = "s" if n > 1 else ""
+            logger.success(
+                f'standardized {n} synonym{s} in "{self._key}": {colors.green(syn_mapper_print)}'
+            )
+        return std_values
+
+    def _add_validated(self) -> tuple[list, list]:
+        """Save features or labels records in the default instance."""
+        from lamindb.models.save import save as ln_save
+
+        registry = self._field.field.model
+        field_name = self._field.field.name
+        model_field = registry.__get_name_with_module__()
+        filter_kwargs = get_current_filter_kwargs(
+            registry, {"organism": self._organism, "source": self._source}
+        )
+        values = [i for i in self.values if isinstance(i, str) and i]
+        if not values:
+            return [], []
+
+        # inspect the default instance and save validated records from public
+        existing_and_public_records = registry.from_values(
+            list(values), field=self._field, **filter_kwargs, mute=True
+        )
+        existing_and_public_labels = [
+            getattr(r, field_name) for r in existing_and_public_records
+        ]
+        # public records that are not already in the database
+        public_records = [r for r in existing_and_public_records if r._state.adding]
+        # here we check to only save the public records if they are from the specified source
+        # we check the uid because r.source and source can be from different instances
+        if self._source:
+            public_records = [
+                r for r in public_records if r.source.uid == self._source.uid
+            ]
+        if len(public_records) > 0:
+            logger.info(f"saving validated records of '{self._key}'")
+            ln_save(public_records)
+            labels_saved_public = [getattr(r, field_name) for r in public_records]
+            # log the saved public labels
+            # the term "transferred" stresses that this is always in the context of transferring
+            # labels from a public ontology or a different instance to the present instance
+            if len(labels_saved_public) > 0:
+                s = "s" if len(labels_saved_public) > 1 else ""
+                logger.success(
+                    f'added {len(labels_saved_public)} record{s} {colors.green("from_public")} with {model_field} for "{self._key}": {_format_values(labels_saved_public)}'
+                )
+        self.labels = existing_and_public_records
+
+        # non-validated records from the default instance
+        non_validated_labels = [
+            i for i in values if i not in existing_and_public_labels
+        ]
+
+        # validated, non-validated
+        return existing_and_public_labels, non_validated_labels
+
+    def _add_new(
+        self,
+        values: list[str],
+        df: pd.DataFrame | None = None,  # remove when all users use schema
+        dtype: str | None = None,
+        **create_kwargs,
+    ) -> None:
+        """Add new labels to the registry."""
+        from lamindb.models.save import save as ln_save
+
+        registry = self._field.field.model
+        field_name = self._field.field.name
+        non_validated_records: RecordList[Any] = []  # type: ignore
+        if df is not None and registry == Feature:
+            nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
+            non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
+        else:
+            if (
+                self._organism
+                and hasattr(registry, "organism")
+                and registry._meta.get_field("organism").is_relation
+            ):
+                # make sure organism record is saved to the current instance
+                create_kwargs["organism"] = _save_organism(name=self._organism)
+
+            for value in values:
+                init_kwargs = {field_name: value}
+                if registry == Feature:
+                    init_kwargs["dtype"] = "cat" if dtype is None else dtype
+                non_validated_records.append(registry(**init_kwargs, **create_kwargs))
+        if len(non_validated_records) > 0:
+            ln_save(non_validated_records)
+            model_field = colors.italic(registry.__get_name_with_module__())
+            s = "s" if len(values) > 1 else ""
+            logger.success(
+                f'added {len(values)} record{s} with {model_field} for "{self._key}": {_format_values(values)}'
+            )
+
+    def _validate(
+        self,
+        values: list[str],
+        curator: CatManager | None = None,  # TODO: not yet used
+    ) -> tuple[list[str], dict]:
+        """Validate ontology terms using LaminDB registries."""
+        registry = self._field.field.model
+        field_name = self._field.field.name
+        model_field = f"{registry.__name__}.{field_name}"
+
+        def _log_mapping_info():
+            logger.indent = ""
+            logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
+            logger.indent = " "
+
+        kwargs_current = get_current_filter_kwargs(
+            registry, {"organism": self._organism, "source": self._source}
+        )
+
+        # inspect values from the default instance, excluding public
+        inspect_result = registry.inspect(
+            values, field=self._field, mute=True, from_source=False, **kwargs_current
+        )
+        non_validated = inspect_result.non_validated
+        syn_mapper = inspect_result.synonyms_mapper
+
+        # inspect the non-validated values from public (BioRecord only)
+        values_validated = []
+        if hasattr(registry, "public"):
+            public_records = registry.from_values(
+                non_validated,
+                field=self._field,
+                mute=True,
+                **kwargs_current,
+            )
+            values_validated += [getattr(r, field_name) for r in public_records]
+
+        # logging messages
+        non_validated_hint_print = f'.add_new_from("{self._key}")'
+        non_validated = [i for i in non_validated if i not in values_validated]
+        n_non_validated = len(non_validated)
+        if n_non_validated == 0:
+            logger.indent = ""
+            logger.success(
+                f'"{self._key}" is validated against {colors.italic(model_field)}'
+            )
+            return [], {}
+        else:
+            are = "is" if n_non_validated == 1 else "are"
+            s = "" if n_non_validated == 1 else "s"
+            print_values = _format_values(non_validated)
+            warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
+            if syn_mapper:
+                s = "" if len(syn_mapper) == 1 else "s"
+                syn_mapper_print = _format_values(
+                    [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
+                )
+                hint_msg = f'.standardize("{self._key}")'
+                warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
+            if n_non_validated > len(syn_mapper):
+                if syn_mapper:
+                    warning_message += "\n for remaining terms:\n"
+                warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
+
+            if logger.indent == "":
+                _log_mapping_info()
+            logger.warning(warning_message)
+            if curator is not None:
+                curator._validate_category_error_messages = strip_ansi_codes(
+                    warning_message
+                )
+            logger.indent = ""
+            return non_validated, syn_mapper
+
+    def validate(self) -> None:
+        """Validate the column."""
+        # add source-validated values to the registry
+        self._validated, self._non_validated = self._add_validated()
+        self._non_validated, self._synonyms = self._validate(values=self._non_validated)
+        # always register new Features if they are columns
+        if self._key == "columns" and self._field == Feature.name:
+            self.add_new()
+
+    def standardize(self) -> None:
+        """Standardize the column."""
+        registry = self._field.field.model
+        if not hasattr(registry, "standardize"):
+            return self.values
+        if self._synonyms is None:
+            self.validate()
+        # get standardized values
+        std_values = self._replace_synonyms()
+        # update non_validated values
+        self._non_validated = [
+            i for i in self._non_validated if i not in self._synonyms.keys()
+        ]
+        # remove synonyms since they are now standardized
+        self._synonyms = {}
+        # update the values with the standardized values
+        self.values = std_values
+
+    def add_new(self, **create_kwargs) -> None:
+        """Add new values to the registry."""
+        if self._non_validated is None:
+            self.validate()
+        if len(self._synonyms) > 0:
+            # raise error because .standardize modifies the input dataset
+            raise ValidationError(
+                "Please run `.standardize()` before adding new values."
+            )
+        self._add_new(
+            values=self._non_validated,
+            **create_kwargs,
+        )
+        # remove the non_validated values since they are now registered
+        self._non_validated = []
+
+
 class CatManager:
     """Manage categoricals by updating registries.
 
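The CatColumn helper added in the hunk above is what the CatManager classes further down delegate validation, standardization, and registration to. A rough usage sketch follows; CatColumn is internal API, and the values, registry field, and key here are illustrative assumptions rather than documented usage:

    import bionty as bt
    from lamindb.curators import CatColumn  # internal helper introduced in 1.4.0

    col = CatColumn(
        values_getter=["T cell", "B cell", "unknown celltype"],  # iterable of values to validate (illustrative)
        field=bt.CellType.name,  # registry field to validate against
        key="cell_type",         # only used for logging
    )
    col.validate()     # fills ._validated / ._non_validated / ._synonyms
    col.standardize()  # maps any recognized synonyms onto validated names
    col.add_new()      # registers whatever is still non-validated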
@@ -909,14 +1262,16 @@ class CatManager:
     - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
     """
 
-    def __init__(self, *, dataset, categoricals, sources,
+    def __init__(self, *, dataset, categoricals, sources, columns_field=None):
         # the below is shared with Curator
         self._artifact: Artifact = None  # pass the dataset as an artifact
         self._dataset: Any = dataset  # pass the dataset as a UPathStr or data object
         if isinstance(self._dataset, Artifact):
             self._artifact = self._dataset
             if self._artifact.otype in {"DataFrame", "AnnData"}:
-                self._dataset = self._dataset.load(
+                self._dataset = self._dataset.load(
+                    is_run_input=False  # we already track this in the Curator constructor
+                )
         self._is_validated: bool = False
         # shared until here
         self._categoricals = categoricals or {}
@@ -924,49 +1279,24 @@ class CatManager:
         self._sources = sources or {}
         self._columns_field = columns_field
         self._validate_category_error_messages: str = ""
-
-        if organism:
-            self._organism = organism
-        else:
-            fields = list(self._categoricals.values()) + [columns_field]
-            organisms = {get_organism_kwargs(field).get("organism") for field in fields}
-            self._organism = organisms.pop() if len(organisms) > 0 else None
+        self._cat_columns: dict[str, CatColumn] = {}
 
     @property
     def non_validated(self) -> dict[str, list[str]]:
         """Return the non-validated features and labels."""
         if self._non_validated is None:
             raise ValidationError("Please run validate() first!")
-        return
+        return {
+            key: cat_column._non_validated
+            for key, cat_column in self._cat_columns.items()
+            if cat_column._non_validated and key != "columns"
+        }
 
     @property
     def categoricals(self) -> dict:
         """Return the columns fields to validate against."""
         return self._categoricals
 
-    def _replace_synonyms(
-        self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
-    ):
-        # replace the values in df
-        std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
-        # remove the standardized values from self.non_validated
-        non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
-        if len(non_validated) == 0:
-            self._non_validated.pop(key, None)  # type: ignore
-        else:
-            self._non_validated[key] = non_validated  # type: ignore
-        # logging
-        n = len(syn_mapper)
-        if n > 0:
-            syn_mapper_print = _format_values(
-                [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
-            )
-            s = "s" if n > 1 else ""
-            logger.success(
-                f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
-            )
-        return std_values
-
     def validate(self) -> bool:
         """Validate dataset.
 
@@ -1006,19 +1336,49 @@ class CatManager:
         if not self._is_validated:  # need to raise error manually
             raise ValidationError("Dataset does not validate. Please curate.")
 
-        self._artifact
-            self._dataset,
-
-
-
+        if self._artifact is None:
+            if isinstance(self._dataset, pd.DataFrame):
+                artifact = Artifact.from_df(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            elif isinstance(self._dataset, AnnData):
+                artifact = Artifact.from_anndata(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            elif data_is_mudata(self._dataset):
+                artifact = Artifact.from_mudata(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            elif data_is_spatialdata(self._dataset):
+                artifact = Artifact.from_spatialdata(
+                    self._dataset,
+                    key=key,
+                    description=description,
+                    revises=revises,
+                    run=run,
+                )
+            else:
+                raise InvalidArgument(  # pragma: no cover
+                    "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
+                )
+            self._artifact = artifact.save()
+        annotate_artifact(  # type: ignore
+            self._artifact,
             index_field=self._columns_field,
-
-            revises=revises,
-            run=run,
-            schema=None,
-            organism=self._organism,
+            cat_columns=self._cat_columns,
         )
-
         return self._artifact
 
 
@@ -1029,24 +1389,73 @@ class DataFrameCatManager(CatManager):
         self,
         df: pd.DataFrame | Artifact,
         columns: FieldAttr = Feature.name,
-        categoricals: dict[str, FieldAttr] | None = None,
-        verbosity: str = "hint",
-        organism: str | None = None,
+        categoricals: list[Feature] | dict[str, FieldAttr] | None = None,
         sources: dict[str, Record] | None = None,
     ) -> None:
-        if organism is not None and not isinstance(organism, str):
-            raise ValueError("organism must be a string such as 'human' or 'mouse'!")
-
-        settings.verbosity = verbosity
         self._non_validated = None
         super().__init__(
             dataset=df,
             columns_field=columns,
-            organism=organism,
             categoricals=categoricals,
             sources=sources,
         )
-
+        if columns == Feature.name:
+            if isinstance(self._categoricals, list):
+                values = [feature.name for feature in self._categoricals]
+            else:
+                values = list(self._categoricals.keys())
+            self._cat_columns["columns"] = CatColumn(
+                values_getter=values,
+                field=self._columns_field,
+                key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
+                source=self._sources.get("columns"),
+            )
+            self._cat_columns["columns"].validate()
+        else:
+            # NOTE: for var_index right now
+            self._cat_columns["columns"] = CatColumn(
+                values_getter=lambda: self._dataset.columns,  # lambda ensures the inplace update
+                values_setter=lambda new_values: setattr(
+                    self._dataset, "columns", pd.Index(new_values)
+                ),
+                field=self._columns_field,
+                key="columns",
+                source=self._sources.get("columns"),
+            )
+        if isinstance(self._categoricals, list):
+            for feature in self._categoricals:
+                result = parse_dtype(feature.dtype)[
+                    0
+                ]  # TODO: support composite dtypes for categoricals
+                key = feature.name
+                field = result["field"]
+                self._cat_columns[key] = CatColumn(
+                    values_getter=lambda k=key: self._dataset[
+                        k
+                    ],  # Capture key as default argument
+                    values_setter=lambda new_values, k=key: self._dataset.__setitem__(
+                        k, new_values
+                    ),
+                    field=field,
+                    key=key,
+                    source=self._sources.get(key),
+                    feature=feature,
+                )
+        else:
+            # below is for backward compat of ln.Curator.from_df()
+            for key, field in self._categoricals.items():
+                self._cat_columns[key] = CatColumn(
+                    values_getter=lambda k=key: self._dataset[
+                        k
+                    ],  # Capture key as default argument
+                    values_setter=lambda new_values, k=key: self._dataset.__setitem__(
+                        k, new_values
+                    ),
+                    field=field,
+                    key=key,
+                    source=self._sources.get(key),
+                    feature=Feature.get(name=key),
+                )
 
     def lookup(self, public: bool = False) -> CatLookup:
         """Lookup categories.
@@ -1058,59 +1467,20 @@ class DataFrameCatManager(CatManager):
             categoricals=self._categoricals,
             slots={"columns": self._columns_field},
             public=public,
-            organism=self._organism,
             sources=self._sources,
         )
 
-    def _save_columns(self, validated_only: bool = True) -> None:
-        """Save column name records."""
-        # Always save features specified as the fields keys
-        update_registry(
-            values=list(self.categoricals.keys()),
-            field=self._columns_field,
-            key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
-            validated_only=False,
-            source=self._sources.get("columns"),
-        )
-
-        # Save the rest of the columns based on validated_only
-        additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
-        if additional_columns:
-            update_registry(
-                values=list(additional_columns),
-                field=self._columns_field,
-                key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
-                validated_only=validated_only,
-                df=self._dataset,  # Get the Feature type from df
-                source=self._sources.get("columns"),
-            )
-
-    @deprecated(new_name="is run by default")
-    def add_new_from_columns(self, organism: str | None = None, **kwargs):
-        pass  # pragma: no cover
-
     def validate(self) -> bool:
-        """Validate variables and categorical observations.
-
-        This method also registers the validated records in the current instance:
-        - from public sources
+        """Validate variables and categorical observations."""
+        self._validate_category_error_messages = ""  # reset the error messages
 
-
-
+        validated = True
+        for _, cat_column in self._cat_columns.items():
+            cat_column.validate()
+            validated &= cat_column.is_validated
+        self._is_validated = validated
+        self._non_validated = {}  # so it's no longer None
 
-        Returns:
-            Whether the DataFrame is validated.
-        """
-        # add all validated records to the current instance
-        self._update_registry_all()
-        self._validate_category_error_messages = ""  # reset the error messages
-        self._is_validated, self._non_validated = validate_categories_in_df(  # type: ignore
-            self._dataset,
-            fields=self.categoricals,
-            sources=self._sources,
-            curator=self,
-            organism=self._organism,
-        )
         return self._is_validated
 
     def standardize(self, key: str) -> None:
@@ -1123,82 +1493,37 @@ class DataFrameCatManager(CatManager):
         """
         if self._artifact is not None:
             raise RuntimeError("can't mutate the dataset when an artifact is passed!")
-        # list is needed to avoid RuntimeError: dictionary changed size during iteration
-        avail_keys = list(self.non_validated.keys())
-        if len(avail_keys) == 0:
-            logger.warning("values are already standardized")
-            return
 
         if key == "all":
-
-
-            syn_mapper = standardize_categories(
-                self.non_validated[k],
-                field=self._categoricals[k],
-                source=self._sources.get(k),
-            )
-            self._dataset[k] = self._replace_synonyms(
-                k, syn_mapper, self._dataset[k]
-            )
-        else:
-            if key not in avail_keys:
-                if key in self._categoricals:
-                    logger.warning(f"No non-standardized values found for {key!r}")
-                else:
-                    raise KeyError(
-                        f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
-                    )
-            else:
-                if key in self._categoricals:  # needed to exclude var_index
-                    syn_mapper = standardize_categories(
-                        self.non_validated[key],
-                        field=self._categoricals[key],
-                        source=self._sources.get(key),
-                        organism=self._organism,
-                    )
-                    self._dataset[key] = self._replace_synonyms(
-                        key, syn_mapper, self._dataset[key]
-                    )
-
-    def _update_registry_all(self, validated_only: bool = True, **kwargs):
-        """Save labels for all features."""
-        for name in self.categoricals.keys():
-            self._update_registry(name, validated_only=validated_only, **kwargs)
-
-    def _update_registry(
-        self, categorical: str, validated_only: bool = True, **kwargs
-    ) -> None:
-        if categorical == "all":
-            self._update_registry_all(validated_only=validated_only, **kwargs)
-        else:
-            if categorical not in self.categoricals:
-                raise ValidationError(
-                    f"Feature {categorical} is not part of the fields!"
-                )
-            update_registry(
-                values=_flatten_unique(self._dataset[categorical]),
-                field=self.categoricals[categorical],
-                key=categorical,
-                validated_only=validated_only,
-                source=self._sources.get(categorical),
-                organism=self._organism,
+            logger.warning(
+                "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
             )
-
-
-
+            for k in self.non_validated.keys():
+                self._cat_columns[k].standardize()
+        else:
+            self._cat_columns[key].standardize()
 
     def add_new_from(self, key: str, **kwargs):
         """Add validated & new categories.
 
         Args:
             key: The key referencing the slot in the DataFrame from which to draw terms.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records
         """
         if len(kwargs) > 0 and key == "all":
             raise ValueError("Cannot pass additional arguments to 'all' key!")
-
+        if key == "all":
+            logger.warning(
+                "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
+            )
+            for k in self.non_validated.keys():
+                self._cat_columns[k].add_new(**kwargs)
+        else:
+            self._cat_columns[key].add_new(**kwargs)
 
+    @deprecated(
+        new_name="Run.filter(transform=context.run.transform, output_artifacts=None)"
+    )
     def clean_up_failed_runs(self):
         """Clean up previous failed runs that don't save any outputs."""
         from lamindb.core._context import context
@@ -1218,8 +1543,6 @@ class AnnDataCatManager(CatManager):
         var_index: FieldAttr | None = None,
         categoricals: dict[str, FieldAttr] | None = None,
         obs_columns: FieldAttr = Feature.name,
-        verbosity: str = "hint",
-        organism: str | None = None,
         sources: dict[str, Record] | None = None,
     ) -> None:
         if isinstance(var_index, str):
@@ -1242,7 +1565,6 @@ class AnnDataCatManager(CatManager):
             dataset=data,
             categoricals=categoricals,
             sources=self._sources,
-            organism=organism,
             columns_field=var_index,
         )
         self._adata = self._dataset
@@ -1250,10 +1572,19 @@ class AnnDataCatManager(CatManager):
             df=self._adata.obs,
             categoricals=self.categoricals,
             columns=obs_columns,
-            verbosity=verbosity,
-            organism=None,
             sources=self._sources,
         )
+        self._cat_columns = self._obs_df_curator._cat_columns.copy()
+        if var_index is not None:
+            self._cat_columns["var_index"] = CatColumn(
+                values_getter=lambda: self._adata.var.index,
+                values_setter=lambda new_values: setattr(
+                    self._adata.var, "index", pd.Index(new_values)
+                ),
+                field=self._var_field,
+                key="var_index",
+                source=self._sources.get("var_index"),
+            )
 
     @property
     def var_index(self) -> FieldAttr:
@@ -1275,76 +1606,51 @@ class AnnDataCatManager(CatManager):
             categoricals=self._obs_fields,
             slots={"columns": self._columns_field, "var_index": self._var_field},
             public=public,
-            organism=self._organism,
             sources=self._sources,
         )
 
-    def _save_from_var_index(
-        self,
-        validated_only: bool = True,
-    ):
-        """Save variable records."""
-        if self.var_index is not None:
-            update_registry(
-                values=list(self._adata.var.index),
-                field=self.var_index,
-                key="var_index",
-                validated_only=validated_only,
-                organism=self._organism,
-                source=self._sources.get("var_index"),
-            )
-
     def add_new_from(self, key: str, **kwargs):
         """Add validated & new categories.
 
         Args:
             key: The key referencing the slot in the DataFrame from which to draw terms.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records
         """
-
+        if key == "all":
+            logger.warning(
+                "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
+            )
+            for k in self.non_validated.keys():
+                self._cat_columns[k].add_new(**kwargs)
+        else:
+            self._cat_columns[key].add_new(**kwargs)
 
+    @deprecated(new_name="add_new_from('var_index')")
     def add_new_from_var_index(self, **kwargs):
         """Update variable records.
 
         Args:
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records.
         """
-        self.
+        self.add_new_from(key="var_index", **kwargs)
 
     def validate(self) -> bool:
         """Validate categories.
 
         This method also registers the validated records in the current instance.
 
-        Args:
-            organism: The organism name.
-
         Returns:
             Whether the AnnData object is validated.
         """
         self._validate_category_error_messages = ""  # reset the error messages
 
-
-        self.
-
-
-
-
-
-                source=self._sources.get("var_index"),
-                hint_print=".add_new_from_var_index()",
-                organism=self._organism,  # type: ignore
-            )
-        else:
-            validated_var = True
-            non_validated_var = []
-        validated_obs = self._obs_df_curator.validate()
-        self._non_validated = self._obs_df_curator._non_validated  # type: ignore
-        if len(non_validated_var) > 0:
-            self._non_validated["var_index"] = non_validated_var  # type: ignore
-        self._is_validated = validated_var and validated_obs
+        validated = True
+        for _, cat_column in self._cat_columns.items():
+            cat_column.validate()
+            validated &= cat_column.is_validated
+
+        self._non_validated = {}  # so it's no longer None
+        self._is_validated = validated
         return self._is_validated
 
     def standardize(self, key: str):
@@ -1360,23 +1666,17 @@ class AnnDataCatManager(CatManager):
         """
         if self._artifact is not None:
             raise RuntimeError("can't mutate the dataset when an artifact is passed!")
-        if key
-
-
-        # in addition to the obs columns, standardize the var.index
-        if key == "var_index" or key == "all":
-            syn_mapper = standardize_categories(
-                self._adata.var.index,
-                field=self.var_index,
-                source=self._sources.get("var_index"),
-                organism=self._organism,
+        if key == "all":
+            logger.warning(
+                "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!"
             )
-
-            self.
-
-
+            for k in self.non_validated.keys():
+                self._cat_columns[k].standardize()
+        else:
+            self._cat_columns[key].standardize()
 
 
+@deprecated(new_name="MuDataCurator")
 class MuDataCatManager(CatManager):
     """Categorical manager for `MuData`."""
 
@@ -1385,15 +1685,12 @@ class MuDataCatManager(CatManager):
         mdata: MuData | Artifact,
         var_index: dict[str, FieldAttr] | None = None,
         categoricals: dict[str, FieldAttr] | None = None,
-        verbosity: str = "hint",
-        organism: str | None = None,
         sources: dict[str, Record] | None = None,
     ) -> None:
         super().__init__(
             dataset=mdata,
             categoricals={},
             sources=sources,
-            organism=organism,
         )
         self._columns_field = (
             var_index or {}
@@ -1402,25 +1699,20 @@ class MuDataCatManager(CatManager):
         self._verify_modality(self._var_fields.keys())
         self._obs_fields = self._parse_categoricals(categoricals or {})
         self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
-        self._verbosity = verbosity
         self._obs_df_curator = None
         if "obs" in self._modalities:
             self._obs_df_curator = DataFrameCatManager(
                 df=self._dataset.obs,
                 columns=Feature.name,
                 categoricals=self._obs_fields.get("obs", {}),
-                verbosity=verbosity,
                 sources=self._sources.get("obs"),
-                organism=organism,
             )
         self._mod_adata_curators = {
             modality: AnnDataCatManager(
                 data=self._dataset[modality],
                 var_index=var_index.get(modality),
                 categoricals=self._obs_fields.get(modality),
-                verbosity=verbosity,
                 sources=self._sources.get(modality),
-                organism=organism,
             )
             for modality in self._modalities
             if modality != "obs"
@@ -1442,6 +1734,16 @@ class MuDataCatManager(CatManager):
         """Return the non-validated features and labels."""
         if self._non_validated is None:
             raise ValidationError("Please run validate() first!")
+        non_validated = {}
+        if (
+            self._obs_df_curator is not None
+            and len(self._obs_df_curator.non_validated) > 0
+        ):
+            non_validated["obs"] = self._obs_df_curator.non_validated
+        for modality, adata_curator in self._mod_adata_curators.items():
+            if len(adata_curator.non_validated) > 0:
+                non_validated[modality] = adata_curator.non_validated
+        self._non_validated = non_validated
         return self._non_validated
 
     def _verify_modality(self, modalities: Iterable[str]):
@@ -1487,35 +1789,18 @@ class MuDataCatManager(CatManager):
                 **{f"{k}_var_index": v for k, v in self._var_fields.items()},
             },
             public=public,
-            organism=self._organism,
             sources=self._sources,
         )
 
-    @deprecated(new_name="
-    def add_new_from_columns(
-        self,
-        modality: str,
-        column_names: list[str] | None = None,
-        **kwargs,
-    ):
-        pass  # pragma: no cover
-
+    @deprecated(new_name="add_new_from('var_index')")
     def add_new_from_var_index(self, modality: str, **kwargs):
         """Update variable records.
 
         Args:
             modality: The modality name.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records.
         """
-        self._mod_adata_curators[modality].
-
-    def _update_registry_all(self):
-        """Update all registries."""
-        if self._obs_df_curator is not None:
-            self._obs_df_curator._update_registry_all(validated_only=True)
-        for _, adata_curator in self._mod_adata_curators.items():
-            adata_curator._obs_df_curator._update_registry_all(validated_only=True)
+        self._mod_adata_curators[modality].add_new_from(key="var_index", **kwargs)
 
     def add_new_from(
         self,
@@ -1528,39 +1813,30 @@ class MuDataCatManager(CatManager):
         Args:
             key: The key referencing the slot in the DataFrame.
             modality: The modality name.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records.
         """
-        if len(kwargs) > 0 and key == "all":
-            raise ValueError("Cannot pass additional arguments to 'all' key!")
         modality = modality or "obs"
         if modality in self._mod_adata_curators:
             adata_curator = self._mod_adata_curators[modality]
             adata_curator.add_new_from(key=key, **kwargs)
         if modality == "obs":
             self._obs_df_curator.add_new_from(key=key, **kwargs)
+        if key == "var_index":
+            self._mod_adata_curators[modality].add_new_from(key=key, **kwargs)
 
     def validate(self) -> bool:
         """Validate categories."""
-        # add all validated records to the current instance
-        self._update_registry_all()
-        self._non_validated = {}  # type: ignore
-
         obs_validated = True
         if "obs" in self._modalities:
             logger.info('validating categoricals in "obs"...')
             obs_validated &= self._obs_df_curator.validate()
-            self._non_validated["obs"] = self._obs_df_curator.non_validated  # type: ignore
-            logger.print("")
 
         mods_validated = True
         for modality, adata_curator in self._mod_adata_curators.items():
             logger.info(f'validating categoricals in modality "{modality}"...')
             mods_validated &= adata_curator.validate()
-            if len(adata_curator.non_validated) > 0:
-                self._non_validated[modality] = adata_curator.non_validated  # type: ignore
-            logger.print("")
 
+        self._non_validated = {}  # so it's no longer None
         self._is_validated = obs_validated & mods_validated
         return self._is_validated
 
@@ -1592,6 +1868,7 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
     )
 
 
+@deprecated(new_name="SpatialDataCurator")
 class SpatialDataCatManager(CatManager):
     """Categorical manager for `SpatialData`."""
 
@@ -1600,8 +1877,6 @@ class SpatialDataCatManager(CatManager):
         sdata: Any,
         var_index: dict[str, FieldAttr],
         categoricals: dict[str, dict[str, FieldAttr]] | None = None,
-        verbosity: str = "hint",
-        organism: str | None = None,
         sources: dict[str, dict[str, Record]] | None = None,
         *,
         sample_metadata_key: str | None = "sample",
@@ -1610,7 +1885,6 @@ class SpatialDataCatManager(CatManager):
             dataset=sdata,
             categoricals={},
             sources=sources,
-            organism=organism,
         )
         if isinstance(sdata, Artifact):
             self._sdata = sdata.load()
@@ -1624,7 +1898,6 @@ class SpatialDataCatManager(CatManager):
         self._table_keys = set(self._var_fields.keys()) | set(
             self._categoricals.keys() - {self._sample_metadata_key}
         )
-        self._verbosity = verbosity
         self._sample_df_curator = None
         if self._sample_metadata_key is not None:
             self._sample_metadata = self._sdata.get_attrs(
@@ -1675,18 +1948,14 @@ class SpatialDataCatManager(CatManager):
|
|
1675
1948
|
df=self._sample_metadata,
|
1676
1949
|
columns=Feature.name,
|
1677
1950
|
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1678
|
-
verbosity=verbosity,
|
1679
1951
|
sources=self._sources.get(self._sample_metadata_key),
|
1680
|
-
organism=organism,
|
1681
1952
|
)
|
1682
1953
|
self._table_adata_curators = {
|
1683
1954
|
table: AnnDataCatManager(
|
1684
1955
|
data=self._sdata[table],
|
1685
1956
|
var_index=var_index.get(table),
|
1686
1957
|
categoricals=self._categoricals.get(table),
|
1687
|
-
verbosity=verbosity,
|
1688
1958
|
sources=self._sources.get(table),
|
1689
|
-
organism=organism,
|
1690
1959
|
)
|
1691
1960
|
for table in self._table_keys
|
1692
1961
|
}
|
@@ -1708,7 +1977,15 @@ class SpatialDataCatManager(CatManager):
         """Return the non-validated features and labels."""
         if self._non_validated is None:
             raise ValidationError("Please run validate() first!")
-
+        non_curated = {}
+        if len(self._sample_df_curator.non_validated) > 0:
+            non_curated[self._sample_metadata_key] = (
+                self._sample_df_curator.non_validated
+            )
+        for table, adata_curator in self._table_adata_curators.items():
+            if len(adata_curator.non_validated) > 0:
+                non_curated[table] = adata_curator.non_validated
+        return non_curated

     def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
         """Verify that the accessors exist (either a valid table or in attrs)."""
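The reworked `non_validated` accessor now aggregates the non-validated values per accessor instead of caching them during `validate()`. A minimal usage sketch, assuming a `SpatialDataCatManager` named `curator` with the default sample key "sample" (names are illustrative, not from this diff):

# sketch: inspect what failed validation, keyed by accessor
if not curator.validate():
    # e.g. {"sample": {"disease": ["Alzheimer"]}, "table1": {"cell_type": ["T-cel"]}}
    print(curator.non_validated)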
@@ -1734,38 +2011,19 @@ class SpatialDataCatManager(CatManager):
             categoricals=cat_values_dict,
             slots={"accessors": cat_values_dict.keys()},
             public=public,
-            organism=self._organism,
             sources=self._sources,
         )

-
-        """Saves labels of all features for sample and table metadata."""
-        if self._sample_df_curator is not None:
-            self._sample_df_curator._update_registry_all(
-                validated_only=True,
-            )
-        for _, adata_curator in self._table_adata_curators.items():
-            adata_curator._obs_df_curator._update_registry_all(
-                validated_only=True,
-            )
-
+    @deprecated(new_name="add_new_from('var_index')")
     def add_new_from_var_index(self, table: str, **kwargs) -> None:
         """Save new values from ``.var.index`` of table.

         Args:
             table: The table key.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records.
         """
-        if self._non_validated is None:
-            raise ValidationError("Run .validate() first.")
-        self._table_adata_curators[table].add_new_from_var_index(**kwargs)
         if table in self.non_validated.keys():
-
-            self._non_validated[table].pop("var_index")
-
-            if len(self.non_validated[table].values()) == 0:
-                self.non_validated.pop(table)
+            self._table_adata_curators[table].add_new_from(key="var_index", **kwargs)

     def add_new_from(
         self,
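Since `add_new_from_var_index` is now only a deprecated shim, the equivalent call under the new API looks roughly like this (a sketch; "table1" is a placeholder table key):

# old, deprecated spelling
curator.add_new_from_var_index("table1")
# new spelling: forwarded to the table's AnnData curator
curator.add_new_from(key="var_index", accessor="table1")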
@@ -1778,29 +2036,17 @@ class SpatialDataCatManager(CatManager):
         Args:
             key: The key referencing the slot in the DataFrame.
             accessor: The accessor key such as 'sample' or 'table x'.
-            organism: The organism name.
             **kwargs: Additional keyword arguments to pass to create new records.
         """
-        if self._non_validated is None:
-            raise ValidationError("Run .validate() first.")
-
-        if len(kwargs) > 0 and key == "all":
-            raise ValueError("Cannot pass additional arguments to 'all' key!")
-
-        if accessor not in self.categoricals:
-            raise ValueError(
-                f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
-            )
-
-        if accessor in self._table_adata_curators:
-            adata_curator = self._table_adata_curators[accessor]
-            adata_curator.add_new_from(key=key, **kwargs)
-        if accessor == self._sample_metadata_key:
-            self._sample_df_curator.add_new_from(key=key, **kwargs)
-
         if accessor in self.non_validated.keys():
-            if
-            self.
+            if accessor in self._table_adata_curators:
+                adata_curator = self._table_adata_curators[accessor]
+                adata_curator.add_new_from(key=key, **kwargs)
+            if accessor == self._sample_metadata_key:
+                self._sample_df_curator.add_new_from(key=key, **kwargs)
+
+        if key == "var_index":
+            self._table_adata_curators[accessor].add_new_from(key=key, **kwargs)

     def standardize(self, key: str, accessor: str | None = None) -> None:
         """Replace synonyms with canonical values.
@@ -1835,42 +2081,27 @@ class SpatialDataCatManager(CatManager):
         if accessor == self._sample_metadata_key:
             self._sample_df_curator.standardize(key)

-        if len(self.non_validated[accessor].values()) == 0:
-            self.non_validated.pop(accessor)
-
     def validate(self) -> bool:
         """Validate variables and categorical observations.

         This method also registers the validated records in the current instance:
         - from public sources

-        Args:
-            organism: The organism name.
-
         Returns:
             Whether the SpatialData object is validated.
         """
         # add all validated records to the current instance
-        self._update_registry_all()
-
-        self._non_validated = {}  # type: ignore
-
         sample_validated = True
         if self._sample_df_curator:
             logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
             sample_validated &= self._sample_df_curator.validate()
-            if len(self._sample_df_curator.non_validated) > 0:
-                self._non_validated["sample"] = self._sample_df_curator.non_validated  # type: ignore
-        logger.print("")

         mods_validated = True
         for table, adata_curator in self._table_adata_curators.items():
             logger.info(f"validating categoricals of table '{table}' ...")
             mods_validated &= adata_curator.validate()
-            if len(adata_curator.non_validated) > 0:
-                self._non_validated[table] = adata_curator.non_validated  # type: ignore
-        logger.print("")

+        self._non_validated = {}  # so it's no longer None
         self._is_validated = sample_validated & mods_validated
         return self._is_validated

@@ -1899,17 +2130,12 @@ class SpatialDataCatManager(CatManager):
         if not self._is_validated:
             raise ValidationError("Dataset does not validate. Please curate.")

-
-        self.
-
-
+        self._artifact = Artifact.from_spatialdata(
+            self._dataset, key=key, description=description, revises=revises, run=run
+        ).save()
+        return annotate_artifact(
+            self._artifact,
             index_field=self.var_index,
-            key=key,
-            artifact=self._artifact,
-            revises=revises,
-            run=run,
-            schema=None,
-            organism=self._organism,
             sample_metadata_key=self._sample_metadata_key,
         )

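The save path now registers the artifact first and only then annotates it. Roughly, the curator performs the equivalent of the following sketch (`sdata`, `key`, and `description` are placeholders):

# sketch of the new save flow for a curated SpatialData object
artifact = Artifact.from_spatialdata(
    sdata, key="my_dataset.zarr", description="curated spatial dataset"
).save()
artifact = annotate_artifact(
    artifact,
    index_field=curator.var_index,
    sample_metadata_key="sample",
)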
@@ -1923,7 +2149,6 @@ class TiledbsomaCatManager(CatManager):
         var_index: dict[str, tuple[str, FieldAttr]],
         categoricals: dict[str, FieldAttr] | None = None,
         obs_columns: FieldAttr = Feature.name,
-        organism: str | None = None,
         sources: dict[str, Record] | None = None,
     ):
         self._obs_fields = categoricals or {}
@@ -1935,7 +2160,6 @@ class TiledbsomaCatManager(CatManager):
         else:
             self._dataset = UPath(experiment_uri)
             self._artifact = None
-        self._organism = organism
         self._sources = sources or {}

         self._is_validated: bool | None = False
@@ -2004,28 +2228,14 @@ class TiledbsomaCatManager(CatManager):

         # register obs columns' names
         register_columns = list(self._obs_fields.keys())
-
-
+        # register categorical keys as features
+        cat_column = CatColumn(
+            values_getter=register_columns,
             field=self._columns_field,
             key="columns",
-            validated_only=False,
-            organism=self._organism,
             source=self._sources.get("columns"),
         )
-
-        # no need to register with validated_only=True if columns are features
-        if (
-            len(additional_columns) > 0
-            and self._columns_field.field.model is not Feature
-        ):
-            update_registry(
-                values=additional_columns,
-                field=self._columns_field,
-                key="columns",
-                validated_only=True,
-                organism=self._organism,
-                source=self._sources.get("columns"),
-            )
+        cat_column.add_new()

     def validate(self):
         """Validate categories."""
@@ -2043,21 +2253,14 @@ class TiledbsomaCatManager(CatManager):
                 var_ms_values = (
                     var_ms.read(column_names=[key]).concat()[key].to_pylist()
                 )
-
-
-                    field=field,
-                    key=var_ms_key,
-                    validated_only=True,
-                    organism=self._organism,
-                    source=self._sources.get(var_ms_key),
-                )
-                _, non_val = validate_categories(
-                    values=var_ms_values,
+                cat_column = CatColumn(
+                    values_getter=var_ms_values,
                     field=field,
                     key=var_ms_key,
-                    organism=self._organism,
                     source=self._sources.get(var_ms_key),
                 )
+                cat_column.validate()
+                non_val = cat_column._non_validated
                 if len(non_val) > 0:
                     validated = False
                     self._non_validated_values[var_ms_key] = non_val
@@ -2072,21 +2275,14 @@ class TiledbsomaCatManager(CatManager):
             values = pa.compute.unique(
                 obs.read(column_names=[key]).concat()[key]
             ).to_pylist()
-
-
-                field=field,
-                key=key,
-                validated_only=True,
-                organism=self._organism,
-                source=self._sources.get(key),
-            )
-            _, non_val = validate_categories(
-                values=values,
+            cat_column = CatColumn(
+                values_getter=values,
                 field=field,
                 key=key,
-                organism=self._organism,
                 source=self._sources.get(key),
             )
+            cat_column.validate()
+            non_val = cat_column._non_validated
             if len(non_val) > 0:
                 validated = False
                 self._non_validated_values[key] = non_val
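The pattern is the same everywhere in this class: build a `CatColumn` and call `validate()` or `add_new()` instead of the removed module-level helpers. A sketch under the assumption that `CatColumn` keeps exactly the constructor arguments shown in this diff (it is used unqualified here, so its import path is not visible in this diff):

import bionty as bt

# sketch: validate a list of categorical values against a registry field
cat_column = CatColumn(
    values_getter=["T cell", "B cell"],  # values, or a callable returning them
    field=bt.CellType.name,              # registry field to validate against
    key="cell_type",                     # column key used in log messages
    source=None,                         # optional Source record
)
cat_column.validate()                    # populates cat_column._non_validated
cat_column.add_new()                     # registers the remaining new values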
@@ -2133,15 +2329,13 @@ class TiledbsomaCatManager(CatManager):
             values, field = self._non_validated_values_field(k)
             if len(values) == 0:
                 continue
-
-
+            cat_column = CatColumn(
+                values_getter=values,
                 field=field,
                 key=k,
-                validated_only=False,
-                organism=self._organism,
                 source=self._sources.get(k),
-                **kwargs,
             )
+            cat_column.add_new()
             # update non-validated values list but keep the key there
             # it will be removed by .validate()
             if k in self._non_validated_values:
@@ -2173,7 +2367,6 @@ class TiledbsomaCatManager(CatManager):
             categoricals=self._obs_fields,
             slots={"columns": self._columns_field, **self._var_fields_flat},
             public=public,
-            organism=self._organism,
             sources=self._sources,
         )

@@ -2210,12 +2403,14 @@ class TiledbsomaCatManager(CatManager):
             else:
                 slot = lambda experiment: experiment.obs
                 slot_key = k
-
-
+            cat_column = CatColumn(
+                values_getter=values,
                 field=field,
+                key=k,
                 source=self._sources.get(k),
-                organism=self._organism,
             )
+            cat_column.validate()
+            syn_mapper = cat_column._synonyms
             if (n_syn_mapper := len(syn_mapper)) == 0:
                 continue

@@ -2300,14 +2495,12 @@ class TiledbsomaCatManager(CatManager):
             df=mock_df,
             field=self._columns_field,
             mute=True,
-            organism=self._organism,
         )
         for ms in self._var_fields:
             var_key, var_field = self._var_fields[ms]
             feature_sets[f"{ms}__var"] = Schema.from_values(
                 values=self._validated_values[f"{ms}__{var_key}"],
                 field=var_field,
-                organism=self._organism,
                 raise_validation_error=False,
             )
         artifact._staged_feature_sets = feature_sets
@@ -2320,7 +2513,6 @@ class TiledbsomaCatManager(CatManager):
             labels = registry.from_values(
                 values=self._validated_values[key],
                 field=field,
-                organism=self._organism,
             )
             if len(labels) == 0:
                 continue
@@ -2359,12 +2551,10 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
         self,
         adata: ad.AnnData,
         categoricals: dict[str, FieldAttr] | None = None,
-        organism: Literal["human", "mouse"] = "human",
         *,
         schema_version: Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
         defaults: dict[str, str] = None,
         extra_sources: dict[str, Record] = None,
-        verbosity: str = "hint",
     ) -> None:
         """CELLxGENE schema curator.

@@ -2372,13 +2562,11 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
             adata: Path to or AnnData object to curate against the CELLxGENE schema.
             categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
                 The CELLxGENE Curator maps against the required CELLxGENE fields by default.
-            organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
             schema_version: The CELLxGENE schema version to curate against.
             defaults: Default values that are set if columns or column values are missing.
             extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
                 These extra sources are joined with the CELLxGENE fixed sources.
                 Use this parameter when subclassing.
-            verbosity: The verbosity level.
         """
         import bionty as bt

@@ -2399,6 +2587,7 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
         categoricals = _restrict_obs_fields(adata.obs, categoricals)

         # Configure sources
+        organism: Literal["human", "mouse"] = "human"
         sources = _create_sources(categoricals, schema_version, organism)
         self.schema_version = schema_version
         self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
@@ -2413,16 +2602,9 @@ class CellxGeneAnnDataCatManager(AnnDataCatManager):
             data=adata,
             var_index=bt.Gene.ensembl_gene_id,
             categoricals=categoricals,
-            verbosity=verbosity,
-            organism=organism,
             sources=sources,
         )

-    @classmethod
-    @deprecated(new_name="cxg_categoricals_defaults")
-    def _get_categoricals_defaults(cls) -> dict[str, str]:
-        return cls.cxg_categoricals_defaults
-
     @classmethod
     def _get_cxg_categoricals(cls) -> dict[str, FieldAttr]:
         """Returns the CELLxGENE schema mapped fields."""
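With `organism` and `verbosity` gone from the signature and the organism fixed internally to "human", constructing the CELLxGENE curator reduces to the remaining keyword arguments. A sketch (the AnnData object and version string are placeholders):

# sketch: CELLxGENE curation without organism/verbosity arguments
curator = CellxGeneAnnDataCatManager(
    adata,
    categoricals=None,      # falls back to the required CELLxGENE fields
    schema_version="5.2.0",
    defaults=None,
)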
@@ -2695,7 +2877,6 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
         pert_time: bool = True,
         *,
         cxg_schema_version: Literal["5.0.0", "5.1.0", "5.2.0"] = "5.2.0",
-        verbosity: str = "hint",
     ):
         """Initialize the curator with configuration and validation settings."""
         self._pert_time = pert_time
@@ -2708,10 +2889,8 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
             adata=adata,
             categoricals=categoricals,
             defaults=categoricals_defaults,
-            organism=organism,
             extra_sources=self._configure_sources(adata),
             schema_version=cxg_schema_version,
-            verbosity=verbosity,
         )

     def _configure_categoricals(self, adata: ad.AnnData):
@@ -2952,7 +3131,7 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:


 def get_organism_kwargs(
-    field: FieldAttr, organism: str | None = None
+    field: FieldAttr, organism: str | None = None, values: Any = None
 ) -> dict[str, str]:
     """Check if a registry needs an organism and return the organism name."""
     registry = field.field.model
@@ -2967,245 +3146,47 @@ def get_organism_kwargs(
         return {"organism": organism or bt.settings.organism.name}
     else:
         organism_record = get_organism_record_from_field(
-            field, organism=organism
+            field, organism=organism, values=values
         )
         if organism_record is not None:
             return {"organism": organism_record.name}
     return {}


-def
-
-    field: FieldAttr,
-    key: str,
-    organism: str | None = None,
-    source: Record | None = None,
-    hint_print: str | None = None,
-    curator: CatManager | None = None,
-) -> tuple[bool, list[str]]:
-    """Validate ontology terms using LaminDB registries.
-
-    Args:
-        values: The values to validate.
-        field: The field attribute.
-        key: The key referencing the slot in the DataFrame.
-        organism: The organism name.
-        source: The source record.
-        standardize: Whether to standardize the values.
-        hint_print: The hint to print that suggests fixing non-validated values.
-    """
-    model_field = f"{field.field.model.__name__}.{field.field.name}"
-
-    def _log_mapping_info():
-        logger.indent = ""
-        logger.info(f'mapping "{key}" on {colors.italic(model_field)}')
-        logger.indent = "   "
-
-    registry = field.field.model
-
-    kwargs_current = get_current_filter_kwargs(
-        registry, {"organism": organism, "source": source}
-    )
-
-    # inspect values from the default instance
-    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
-    non_validated = inspect_result.non_validated
-    syn_mapper = inspect_result.synonyms_mapper
-
-    # inspect the non-validated values from public (BioRecord only)
-    values_validated = []
-    if hasattr(registry, "public"):
-        public_records = registry.from_values(
-            non_validated,
-            field=field,
-            mute=True,
-            **kwargs_current,
-        )
-        values_validated += [getattr(r, field.field.name) for r in public_records]
-
-    # logging messages
-    non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
-    non_validated = [i for i in non_validated if i not in values_validated]
-    n_non_validated = len(non_validated)
-    if n_non_validated == 0:
-        logger.indent = ""
-        logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
-        return True, []
-    else:
-        are = "is" if n_non_validated == 1 else "are"
-        s = "" if n_non_validated == 1 else "s"
-        print_values = _format_values(non_validated)
-        warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
-        if syn_mapper:
-            s = "" if len(syn_mapper) == 1 else "s"
-            syn_mapper_print = _format_values(
-                [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
-            )
-            hint_msg = f'.standardize("{key}")'
-            warning_message += f"    {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n    → curate synonyms via {colors.cyan(hint_msg)}"
-        if n_non_validated > len(syn_mapper):
-            if syn_mapper:
-                warning_message += "\n    for remaining terms:\n"
-            warning_message += f"    → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
-
-        if logger.indent == "":
-            _log_mapping_info()
-        logger.warning(warning_message)
-        if curator is not None:
-            curator._validate_category_error_messages = strip_ansi_codes(
-                warning_message
-            )
-        logger.indent = ""
-        return False, non_validated
-
-
-def standardize_categories(
-    values: Iterable[str],
-    field: FieldAttr,
-    organism: str | None = None,
-    source: Record | None = None,
-) -> dict:
-    """Get a synonym mapper."""
-    registry = field.field.model
-    if not hasattr(registry, "standardize"):
-        return {}
-    # standardize values using the default instance
-    syn_mapper = registry.standardize(
-        values,
-        field=field.field.name,
-        organism=organism,
-        source=source,
-        mute=True,
-        return_mapper=True,
-    )
-    return syn_mapper
-
-
-def validate_categories_in_df(
-    df: pd.DataFrame,
-    fields: dict[str, FieldAttr],
-    sources: dict[str, Record] = None,
-    curator: CatManager | None = None,
-    **kwargs,
-) -> tuple[bool, dict]:
-    """Validate categories in DataFrame columns using LaminDB registries."""
-    if not fields:
-        return True, {}
-
-    if sources is None:
-        sources = {}
-    validated = True
-    non_validated = {}
-    for key, field in fields.items():
-        is_val, non_val = validate_categories(
-            df[key],
-            field=field,
-            key=key,
-            source=sources.get(key),
-            curator=curator,
-            **kwargs,
-        )
-        validated &= is_val
-        if len(non_val) > 0:
-            non_validated[key] = non_val
-    return validated, non_validated
-
-
-def save_artifact(
-    data: pd.DataFrame | ScverseDataStructures,
+def annotate_artifact(
+    artifact: Artifact,
     *,
-    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
-    index_field: FieldAttr | dict[str, FieldAttr] | None = None,
-    description: str | None = None,
-    organism: str | None = None,
-    key: str | None = None,
-    artifact: Artifact | None = None,
-    revises: Artifact | None = None,
-    run: Run | None = None,
     schema: Schema | None = None,
+    cat_columns: dict[str, CatColumn] | None = None,
+    index_field: FieldAttr | dict[str, FieldAttr] | None = None,
     **kwargs,
 ) -> Artifact:
-    """Save all metadata with an Artifact.
-
-    Args:
-        data: The object to save.
-        fields: A dictionary mapping obs_column to registry_field.
-        index_field: The registry field to validate variables index against.
-        description: A description of the artifact.
-        organism: The organism name.
-        key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
-        artifact: A already registered artifact. Passing this will not save a new artifact from data.
-        revises: Previous version of the artifact. Triggers a revision.
-        run: The run that creates the artifact.
-        schema: The Schema to associate with the Artifact.
-
-    Returns:
-        The saved Artifact.
-    """
     from ..models.artifact import add_labels

-    if
-
-        artifact = Artifact.from_df(
-            data, description=description, key=key, revises=revises, run=run
-        )
-    elif isinstance(data, AnnData):
-        artifact = Artifact.from_anndata(
-            data, description=description, key=key, revises=revises, run=run
-        )
-    elif data_is_mudata(data):
-        artifact = Artifact.from_mudata(
-            data, description=description, key=key, revises=revises, run=run
-        )
-    elif data_is_spatialdata(data):
-        artifact = Artifact.from_spatialdata(
-            data, description=description, key=key, revises=revises, run=run
-        )
-    else:
-        raise InvalidArgument(  # pragma: no cover
-            "data must be one of pd.Dataframe, AnnData, MuData, SpatialData."
-        )
-    artifact.save()
+    if cat_columns is None:
+        cat_columns = {}

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    else:
-        values = df[key].unique()
-        labels = registry.from_values(values, field=field, **filter_kwargs)
-        if len(labels) == 0:
-            continue
-        label_ref_is_name = None
-        if hasattr(registry, "_name_field"):
-            label_ref_is_name = field.field.name == registry._name_field
-        add_labels(
-            artifact,
-            records=labels,
-            feature=feature,
-            feature_ref_is_name=feature_ref_is_name,
-            label_ref_is_name=label_ref_is_name,
-            from_curator=True,
-        )
+    # annotate with labels
+    for key, cat_column in cat_columns.items():
+        if (
+            cat_column._field.field.model == Feature
+            or key == "columns"
+            or key == "var_index"
+        ):
+            continue
+        add_labels(
+            artifact,
+            records=cat_column.labels,
+            feature=cat_column.feature,
+            feature_ref_is_name=None,  # do not need anymore
+            label_ref_is_name=cat_column.label_ref_is_name,
+            from_curator=True,
+        )

+    # annotate with inferred feature sets
     match artifact.otype:
         case "DataFrame":
-            artifact.features._add_set_from_df(field=index_field
-            _add_labels(
-                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
-            )
+            artifact.features._add_set_from_df(field=index_field)  # type: ignore
         case "AnnData":
             if schema is not None and "uns" in schema.slots:
                 uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
@@ -3214,80 +3195,25 @@ def save_artifact(
             else:
                 uns_field = None
             artifact.features._add_set_from_anndata(  # type: ignore
-                var_field=index_field, uns_field=uns_field
-            )
-            _add_labels(
-                data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
+                var_field=index_field, uns_field=uns_field
             )
         case "MuData":
-            artifact.features._add_set_from_mudata(  # type: ignore
-                var_fields=index_field, organism=organism
-            )
-            for modality, modality_fields in fields.items():
-                column_field_modality = index_field.get(modality)
-                if modality == "obs":
-                    _add_labels(
-                        data,
-                        artifact,
-                        modality_fields,
-                        feature_ref_is_name=(
-                            None
-                            if column_field_modality is None
-                            else _ref_is_name(column_field_modality)
-                        ),
-                    )
-                else:
-                    _add_labels(
-                        data[modality],
-                        artifact,
-                        modality_fields,
-                        feature_ref_is_name=(
-                            None
-                            if column_field_modality is None
-                            else _ref_is_name(column_field_modality)
-                        ),
-                    )
+            artifact.features._add_set_from_mudata(var_fields=index_field)  # type: ignore
         case "SpatialData":
             artifact.features._add_set_from_spatialdata(  # type: ignore
                 sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
                 var_fields=index_field,
-                organism=organism,
             )
-            sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
-            for accessor, accessor_fields in fields.items():
-                column_field = index_field.get(accessor)
-                if accessor == sample_metadata_key:
-                    _add_labels(
-                        data.get_attrs(
-                            key=sample_metadata_key, return_as="df", flatten=True
-                        ),
-                        artifact,
-                        accessor_fields,
-                        feature_ref_is_name=(
-                            None if column_field is None else _ref_is_name(column_field)
-                        ),
-                    )
-                else:
-                    _add_labels(
-                        data.tables[accessor],
-                        artifact,
-                        accessor_fields,
-                        feature_ref_is_name=(
-                            None if column_field is None else _ref_is_name(column_field)
-                        ),
-                    )
         case _:
             raise NotImplementedError  # pragma: no cover

-    artifact.schema = schema
-    artifact.save()
-
     slug = ln_setup.settings.instance.slug
     if ln_setup.settings.instance.is_remote:  # pdagma: no cover
         logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
     return artifact


+# TODO: need this function to support mutli-value columns
 def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
     """Flatten a Pandas series containing lists or single items into a unique list of elements."""
     result = set()
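Compared to the removed `save_artifact`, the new `annotate_artifact` no longer creates or saves the artifact itself; callers register the artifact first and pass it in together with pre-validated `CatColumn` objects. A sketch with placeholder names (`df`, `cat_columns`):

# sketch of the new division of labor between saving and annotating
artifact = Artifact.from_df(df, key="curated.parquet").save()  # save first
artifact = annotate_artifact(
    artifact,
    cat_columns=cat_columns,   # dict of already-validated CatColumn objects
    index_field=Feature.name,  # field used to infer the feature set
)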
@@ -3301,138 +3227,6 @@ def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
     return list(result)


-def update_registry(
-    values: list[str],
-    field: FieldAttr,
-    key: str,
-    validated_only: bool = True,
-    df: pd.DataFrame | None = None,
-    organism: str | None = None,
-    dtype: str | None = None,
-    source: Record | None = None,
-    **create_kwargs,
-) -> None:
-    """Save features or labels records in the default instance..
-
-    Args:
-        values: A list of values to be saved as labels.
-        field: The FieldAttr object representing the field for which labels are being saved.
-        key: The name of the feature to save.
-        validated_only: If True, only save validated labels.
-        df: A DataFrame to save labels from.
-        organism: The organism name.
-        dtype: The type of the feature.
-        source: The source record.
-        **create_kwargs: Additional keyword arguments to pass to the registry model to create new records.
-    """
-    from lamindb.models.save import save as ln_save
-
-    registry = field.field.model
-    filter_kwargs = get_current_filter_kwargs(
-        registry, {"organism": organism, "source": source}
-    )
-    values = [i for i in values if isinstance(i, str) and i]
-    if not values:
-        return
-
-    labels_saved: dict = {"from public": [], "new": []}
-
-    # inspect the default instance and save validated records from public
-    existing_and_public_records = registry.from_values(
-        list(values), field=field, **filter_kwargs, mute=True
-    )
-    existing_and_public_labels = [
-        getattr(r, field.field.name) for r in existing_and_public_records
-    ]
-    # public records that are not already in the database
-    public_records = [r for r in existing_and_public_records if r._state.adding]
-    # here we check to only save the public records if they are from the specified source
-    # we check the uid because r.source and source can be from different instances
-    if source:
-        public_records = [r for r in public_records if r.source.uid == source.uid]
-    if len(public_records) > 0:
-        logger.info(f"saving validated records of '{key}'")
-        ln_save(public_records)
-        labels_saved["from public"] = [
-            getattr(r, field.field.name) for r in public_records
-        ]
-    # non-validated records from the default instance
-    non_validated_labels = [i for i in values if i not in existing_and_public_labels]
-
-    # save non-validated/new records
-    labels_saved["new"] = non_validated_labels
-    if not validated_only:
-        non_validated_records: RecordList[Any] = []  # type: ignore
-        if df is not None and registry == Feature:
-            nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
-            non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
-        else:
-            if (
-                organism
-                and hasattr(registry, "organism")
-                and registry._meta.get_field("organism").is_relation
-            ):
-                # make sure organism record is saved to the current instance
-                create_kwargs["organism"] = _save_organism(name=organism)
-
-            for value in labels_saved["new"]:
-                init_kwargs = {field.field.name: value}
-                if registry == Feature:
-                    init_kwargs["dtype"] = "cat" if dtype is None else dtype
-                non_validated_records.append(registry(**init_kwargs, **create_kwargs))
-        ln_save(non_validated_records)
-
-    # save parent labels for ulabels, for example a parent label "project" for label "project001"
-    if registry == ULabel and field.field.name == "name":
-        save_ulabels_type(values, field=field, key=key)
-
-    log_saved_labels(
-        labels_saved,
-        key=key,
-        model_field=f"{registry.__name__}.{field.field.name}",
-        validated_only=validated_only,
-    )
-
-
-def log_saved_labels(
-    labels_saved: dict,
-    key: str,
-    model_field: str,
-    validated_only: bool = True,
-) -> None:
-    """Log the saved labels."""
-    from ..models._from_values import _format_values
-
-    model_field = colors.italic(model_field)
-    for k, labels in labels_saved.items():
-        if not labels:
-            continue
-        if k == "new" and validated_only:
-            continue
-        else:
-            k = "" if k == "new" else f"{colors.green(k)} "
-            # the term "transferred" stresses that this is always in the context of transferring
-            # labels from a public ontology or a different instance to the present instance
-            s = "s" if len(labels) > 1 else ""
-            logger.success(
-                f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_format_values(labels)}'
-            )
-
-
-def save_ulabels_type(values: list[str], field: FieldAttr, key: str) -> None:
-    """Save the ULabel type of the given labels."""
-    registry = field.field.model
-    assert registry == ULabel  # noqa: S101
-    all_records = registry.filter(**{field.field.name: list(values)}).all()
-    # so `tissue_type` becomes `TissueType`
-    type_name = "".join([i.capitalize() for i in key.lower().split("_")])
-    ulabel_type = registry.filter(name=type_name, is_type=True).one_or_none()
-    if ulabel_type is None:
-        ulabel_type = registry(name=type_name, is_type=True).save()
-        logger.important(f"Created a ULabel type: {ulabel_type}")
-    all_records.update(type=ulabel_type)
-
-
 def _save_organism(name: str):
     """Save an organism record."""
     import bionty as bt
@@ -3469,15 +3263,14 @@ def from_df(
     df: pd.DataFrame,
     categoricals: dict[str, FieldAttr] | None = None,
     columns: FieldAttr = Feature.name,
-    verbosity: str = "hint",
     organism: str | None = None,
 ) -> DataFrameCatManager:
+    if organism is not None:
+        logger.warning("organism is ignored, define it on the dtype level")
     return DataFrameCatManager(
         df=df,
         categoricals=categoricals,
         columns=columns,
-        verbosity=verbosity,
-        organism=organism,
     )


@@ -3488,17 +3281,16 @@ def from_anndata(
     var_index: FieldAttr,
     categoricals: dict[str, FieldAttr] | None = None,
     obs_columns: FieldAttr = Feature.name,
-    verbosity: str = "hint",
     organism: str | None = None,
     sources: dict[str, Record] | None = None,
 ) -> AnnDataCatManager:
+    if organism is not None:
+        logger.warning("organism is ignored, define it on the dtype level")
     return AnnDataCatManager(
         data=data,
         var_index=var_index,
         categoricals=categoricals,
         obs_columns=obs_columns,
-        verbosity=verbosity,
-        organism=organism,
         sources=sources,
     )

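For all of these `from_*` constructors, `organism` is still accepted but no longer forwarded; passing it only triggers a warning. Roughly (with `adata` and the gene field as placeholders):

# sketch: organism is now ignored by the constructor
import bionty as bt
curator = from_anndata(
    adata,
    var_index=bt.Gene.ensembl_gene_id,
    organism="human",  # logs: "organism is ignored, define it on the dtype level"
)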
@@ -3509,15 +3301,16 @@ def from_mudata(
     mdata: MuData | UPathStr,
     var_index: dict[str, dict[str, FieldAttr]],
     categoricals: dict[str, FieldAttr] | None = None,
-    verbosity: str = "hint",
     organism: str | None = None,
 ) -> MuDataCatManager:
+    if not is_package_installed("mudata"):
+        raise ImportError("Please install mudata: pip install mudata")
+    if organism is not None:
+        logger.warning("organism is ignored, define it on the dtype level")
     return MuDataCatManager(
         mdata=mdata,
         var_index=var_index,
         categoricals=categoricals,
-        verbosity=verbosity,
-        organism=organism,
     )


@@ -3531,12 +3324,13 @@ def from_tiledbsoma(
     organism: str | None = None,
     sources: dict[str, Record] | None = None,
 ) -> TiledbsomaCatManager:
+    if organism is not None:
+        logger.warning("organism is ignored, define it on the dtype level")
     return TiledbsomaCatManager(
         experiment_uri=experiment_uri,
         var_index=var_index,
         categoricals=categoricals,
         obs_columns=obs_columns,
-        organism=organism,
         sources=sources,
     )

@@ -3549,21 +3343,17 @@ def from_spatialdata(
     categoricals: dict[str, dict[str, FieldAttr]] | None = None,
     organism: str | None = None,
     sources: dict[str, dict[str, Record]] | None = None,
-    verbosity: str = "hint",
     *,
     sample_metadata_key: str = "sample",
 ):
-
-
-
-
-
+    if not is_package_installed("spatialdata"):
+        raise ImportError("Please install spatialdata: pip install spatialdata")
+    if organism is not None:
+        logger.warning("organism is ignored, define it on the dtype level")
     return SpatialDataCatManager(
         sdata=sdata,
         var_index=var_index,
         categoricals=categoricals,
-        verbosity=verbosity,
-        organism=organism,
         sources=sources,
         sample_metadata_key=sample_metadata_key,
     )
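`from_mudata` and `from_spatialdata` now fail fast when their optional dependency is missing, using the `is_package_installed` helper imported elsewhere in this module; the guard amounts to:

# sketch: fail fast if the optional dependency is missing
if not is_package_installed("spatialdata"):
    raise ImportError("Please install spatialdata: pip install spatialdata")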