lamindb 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_view.py +2 -2
- lamindb/base/types.py +50 -11
- lamindb/core/types.py +1 -1
- lamindb/curators/__init__.py +232 -222
- lamindb/curators/_cellxgene_schemas/__init__.py +1 -1
- lamindb/models/_feature_manager.py +21 -28
- lamindb/models/_from_values.py +53 -97
- lamindb/models/_label_manager.py +17 -10
- lamindb/models/artifact.py +30 -6
- lamindb/models/can_curate.py +20 -20
- lamindb/models/feature.py +47 -48
- lamindb/models/record.py +29 -25
- lamindb/models/run.py +4 -8
- lamindb/models/schema.py +7 -7
- {lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/METADATA +3 -3
- {lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/RECORD +19 -19
- {lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
- {lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -30,7 +30,7 @@ from __future__ import annotations
|
|
30
30
|
import copy
|
31
31
|
import re
|
32
32
|
from itertools import chain
|
33
|
-
from typing import TYPE_CHECKING, Any, Literal
|
33
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal
|
34
34
|
|
35
35
|
import anndata as ad
|
36
36
|
import lamindb_setup as ln_setup
|
@@ -65,7 +65,7 @@ from lamindb.models.artifact import (
|
|
65
65
|
data_is_mudata,
|
66
66
|
data_is_spatialdata,
|
67
67
|
)
|
68
|
-
from lamindb.models.feature import parse_dtype,
|
68
|
+
from lamindb.models.feature import parse_dtype, parse_cat_dtype
|
69
69
|
from lamindb.models._from_values import _format_values
|
70
70
|
|
71
71
|
from ..errors import InvalidArgument, ValidationError
|
@@ -106,16 +106,22 @@ class CatLookup:
|
|
106
106
|
categoricals: dict[str, FieldAttr],
|
107
107
|
slots: dict[str, FieldAttr] = None,
|
108
108
|
public: bool = False,
|
109
|
+
organism: str | None = None,
|
110
|
+
sources: dict[str, Record] | None = None,
|
109
111
|
) -> None:
|
110
112
|
slots = slots or {}
|
111
113
|
self._categoricals = {**categoricals, **slots}
|
112
114
|
self._public = public
|
115
|
+
self._organism = organism
|
116
|
+
self._sources = sources
|
113
117
|
|
114
118
|
def __getattr__(self, name):
|
115
119
|
if name in self._categoricals:
|
116
120
|
registry = self._categoricals[name].field.model
|
117
121
|
if self._public and hasattr(registry, "public"):
|
118
|
-
return registry.public(
|
122
|
+
return registry.public(
|
123
|
+
organism=self._organism, source=self._sources.get(name)
|
124
|
+
).lookup()
|
119
125
|
else:
|
120
126
|
return registry.lookup()
|
121
127
|
raise AttributeError(
|
@@ -126,7 +132,9 @@ class CatLookup:
|
|
126
132
|
if name in self._categoricals:
|
127
133
|
registry = self._categoricals[name].field.model
|
128
134
|
if self._public and hasattr(registry, "public"):
|
129
|
-
return registry.public(
|
135
|
+
return registry.public(
|
136
|
+
organism=self._organism, source=self._sources.get(name)
|
137
|
+
).lookup()
|
130
138
|
else:
|
131
139
|
return registry.lookup()
|
132
140
|
raise AttributeError(
|
@@ -229,7 +237,7 @@ class Curator:
|
|
229
237
|
"""{}""" # noqa: D415
|
230
238
|
# Note that this docstring has to be consistent with the Artifact()
|
231
239
|
# constructor signature
|
232
|
-
pass
|
240
|
+
pass # pragma: no cover
|
233
241
|
|
234
242
|
|
235
243
|
class SlotsCurator(Curator):
|
@@ -295,6 +303,28 @@ class SlotsCurator(Curator):
|
|
295
303
|
)
|
296
304
|
|
297
305
|
|
306
|
+
def check_dtype(expected_type) -> Callable:
|
307
|
+
"""Creates a check function for Pandera that validates a column's dtype.
|
308
|
+
|
309
|
+
Args:
|
310
|
+
expected_type: String identifier for the expected type ('int', 'float', or 'num')
|
311
|
+
|
312
|
+
Returns:
|
313
|
+
A function that checks if a series has the expected dtype
|
314
|
+
"""
|
315
|
+
|
316
|
+
def check_function(series):
|
317
|
+
if expected_type == "int":
|
318
|
+
is_valid = pd.api.types.is_integer_dtype(series.dtype)
|
319
|
+
elif expected_type == "float":
|
320
|
+
is_valid = pd.api.types.is_float_dtype(series.dtype)
|
321
|
+
elif expected_type == "num":
|
322
|
+
is_valid = pd.api.types.is_numeric_dtype(series.dtype)
|
323
|
+
return is_valid
|
324
|
+
|
325
|
+
return check_function
|
326
|
+
|
327
|
+
|
298
328
|
class DataFrameCurator(Curator):
|
299
329
|
# the example in the docstring is tested in test_curators_quickstart_example
|
300
330
|
"""Curator for `DataFrame`.
|
@@ -348,14 +378,33 @@ class DataFrameCurator(Curator):
|
|
348
378
|
# populate features
|
349
379
|
pandera_columns = {}
|
350
380
|
for feature in schema.features.all():
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
381
|
+
if feature.dtype in {"int", "float", "num"}:
|
382
|
+
dtype = (
|
383
|
+
self._dataset[feature.name].dtype
|
384
|
+
if feature.name in self._dataset.columns
|
385
|
+
else None
|
386
|
+
)
|
387
|
+
pandera_columns[feature.name] = pandera.Column(
|
388
|
+
dtype=None,
|
389
|
+
checks=pandera.Check(
|
390
|
+
check_dtype(feature.dtype),
|
391
|
+
element_wise=False,
|
392
|
+
error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
|
393
|
+
),
|
394
|
+
nullable=feature.nullable,
|
395
|
+
coerce=feature.coerce_dtype,
|
396
|
+
)
|
397
|
+
else:
|
398
|
+
pandera_dtype = (
|
399
|
+
feature.dtype
|
400
|
+
if not feature.dtype.startswith("cat")
|
401
|
+
else "category"
|
402
|
+
)
|
403
|
+
pandera_columns[feature.name] = pandera.Column(
|
404
|
+
pandera_dtype,
|
405
|
+
nullable=feature.nullable,
|
406
|
+
coerce=feature.coerce_dtype,
|
407
|
+
)
|
359
408
|
if feature.dtype.startswith("cat"):
|
360
409
|
categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
|
361
410
|
self._pandera_schema = pandera.DataFrameSchema(
|
@@ -365,7 +414,7 @@ class DataFrameCurator(Curator):
|
|
365
414
|
assert schema.itype is not None # noqa: S101
|
366
415
|
self._cat_manager = DataFrameCatManager(
|
367
416
|
self._dataset,
|
368
|
-
columns=
|
417
|
+
columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
|
369
418
|
categoricals=categoricals,
|
370
419
|
)
|
371
420
|
|
@@ -454,7 +503,7 @@ class DataFrameCurator(Curator):
|
|
454
503
|
"""{}""" # noqa: D415
|
455
504
|
if not self._is_validated:
|
456
505
|
self.validate() # raises ValidationError if doesn't validate
|
457
|
-
result =
|
506
|
+
result = parse_cat_dtype(self._schema.itype, is_itype=True)
|
458
507
|
return save_artifact( # type: ignore
|
459
508
|
self._dataset,
|
460
509
|
description=description,
|
@@ -545,7 +594,7 @@ class AnnDataCurator(SlotsCurator):
|
|
545
594
|
slot_schema,
|
546
595
|
)
|
547
596
|
for slot, slot_schema in schema.slots.items()
|
548
|
-
if slot in {"obs", "var"}
|
597
|
+
if slot in {"obs", "var", "uns"}
|
549
598
|
}
|
550
599
|
|
551
600
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
@@ -560,14 +609,16 @@ class AnnDataCurator(SlotsCurator):
|
|
560
609
|
"""{}""" # noqa: D415
|
561
610
|
if not self._is_validated:
|
562
611
|
self.validate()
|
612
|
+
if "obs" in self.slots:
|
613
|
+
categoricals = self.slots["obs"]._cat_manager.categoricals
|
614
|
+
else:
|
615
|
+
categoricals = {}
|
563
616
|
return save_artifact( # type: ignore
|
564
617
|
self._dataset,
|
565
618
|
description=description,
|
566
|
-
fields=
|
619
|
+
fields=categoricals,
|
567
620
|
index_field=(
|
568
|
-
|
569
|
-
"field"
|
570
|
-
]
|
621
|
+
parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
|
571
622
|
if "var" in self._slots
|
572
623
|
else None
|
573
624
|
),
|
@@ -595,7 +646,7 @@ def _assign_var_fields_categoricals_multimodal(
|
|
595
646
|
categoricals[modality] = {}
|
596
647
|
|
597
648
|
if slot_type == "var":
|
598
|
-
var_field =
|
649
|
+
var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
|
599
650
|
if modality is None:
|
600
651
|
# This should rarely/never be used since tables should have different var fields
|
601
652
|
var_fields[slot] = var_field # pragma: no cover
|
@@ -870,10 +921,16 @@ class CatManager:
|
|
870
921
|
# shared until here
|
871
922
|
self._categoricals = categoricals or {}
|
872
923
|
self._non_validated = None
|
873
|
-
self._organism = organism
|
874
924
|
self._sources = sources or {}
|
875
925
|
self._columns_field = columns_field
|
876
926
|
self._validate_category_error_messages: str = ""
|
927
|
+
# make sure to only fetch organism once at the beginning
|
928
|
+
if organism:
|
929
|
+
self._organism = organism
|
930
|
+
else:
|
931
|
+
fields = list(self._categoricals.values()) + [columns_field]
|
932
|
+
organisms = {get_organism_kwargs(field).get("organism") for field in fields}
|
933
|
+
self._organism = organisms.pop() if len(organisms) > 0 else None
|
877
934
|
|
878
935
|
@property
|
879
936
|
def non_validated(self) -> dict[str, list[str]]:
|
@@ -918,7 +975,7 @@ class CatManager:
|
|
918
975
|
Returns:
|
919
976
|
The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
|
920
977
|
"""
|
921
|
-
pass
|
978
|
+
pass # pragma: no cover
|
922
979
|
|
923
980
|
def standardize(self, key: str) -> None:
|
924
981
|
"""Replace synonyms with standardized values.
|
@@ -943,31 +1000,24 @@ class CatManager:
|
|
943
1000
|
run: Run | None = None,
|
944
1001
|
) -> Artifact:
|
945
1002
|
"""{}""" # noqa: D415
|
946
|
-
|
947
|
-
|
1003
|
+
# Make sure all labels are saved in the current instance
|
948
1004
|
if not self._is_validated:
|
949
1005
|
self.validate() # returns True or False
|
950
1006
|
if not self._is_validated: # need to raise error manually
|
951
1007
|
raise ValidationError("Dataset does not validate. Please curate.")
|
952
1008
|
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
self.
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
run=run,
|
966
|
-
schema=None,
|
967
|
-
organism=self._organism,
|
968
|
-
)
|
969
|
-
finally:
|
970
|
-
settings.verbosity = verbosity
|
1009
|
+
self._artifact = save_artifact( # type: ignore
|
1010
|
+
self._dataset,
|
1011
|
+
key=key,
|
1012
|
+
description=description,
|
1013
|
+
fields=self.categoricals,
|
1014
|
+
index_field=self._columns_field,
|
1015
|
+
artifact=self._artifact,
|
1016
|
+
revises=revises,
|
1017
|
+
run=run,
|
1018
|
+
schema=None,
|
1019
|
+
organism=self._organism,
|
1020
|
+
)
|
971
1021
|
|
972
1022
|
return self._artifact
|
973
1023
|
|
@@ -984,8 +1034,6 @@ class DataFrameCatManager(CatManager):
|
|
984
1034
|
organism: str | None = None,
|
985
1035
|
sources: dict[str, Record] | None = None,
|
986
1036
|
) -> None:
|
987
|
-
from lamindb.core._settings import settings
|
988
|
-
|
989
1037
|
if organism is not None and not isinstance(organism, str):
|
990
1038
|
raise ValueError("organism must be a string such as 'human' or 'mouse'!")
|
991
1039
|
|
@@ -1010,6 +1058,8 @@ class DataFrameCatManager(CatManager):
|
|
1010
1058
|
categoricals=self._categoricals,
|
1011
1059
|
slots={"columns": self._columns_field},
|
1012
1060
|
public=public,
|
1061
|
+
organism=self._organism,
|
1062
|
+
sources=self._sources,
|
1013
1063
|
)
|
1014
1064
|
|
1015
1065
|
def _save_columns(self, validated_only: bool = True) -> None:
|
@@ -1018,18 +1068,18 @@ class DataFrameCatManager(CatManager):
|
|
1018
1068
|
update_registry(
|
1019
1069
|
values=list(self.categoricals.keys()),
|
1020
1070
|
field=self._columns_field,
|
1021
|
-
key="columns",
|
1071
|
+
key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
|
1022
1072
|
validated_only=False,
|
1023
1073
|
source=self._sources.get("columns"),
|
1024
1074
|
)
|
1025
1075
|
|
1026
1076
|
# Save the rest of the columns based on validated_only
|
1027
|
-
additional_columns = set(self._dataset.
|
1077
|
+
additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
|
1028
1078
|
if additional_columns:
|
1029
1079
|
update_registry(
|
1030
1080
|
values=list(additional_columns),
|
1031
1081
|
field=self._columns_field,
|
1032
|
-
key="columns",
|
1082
|
+
key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
|
1033
1083
|
validated_only=validated_only,
|
1034
1084
|
df=self._dataset, # Get the Feature type from df
|
1035
1085
|
source=self._sources.get("columns"),
|
@@ -1037,7 +1087,7 @@ class DataFrameCatManager(CatManager):
|
|
1037
1087
|
|
1038
1088
|
@deprecated(new_name="is run by default")
|
1039
1089
|
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
1040
|
-
pass
|
1090
|
+
pass # pragma: no cover
|
1041
1091
|
|
1042
1092
|
def validate(self) -> bool:
|
1043
1093
|
"""Validate variables and categorical observations.
|
@@ -1093,7 +1143,7 @@ class DataFrameCatManager(CatManager):
|
|
1093
1143
|
else:
|
1094
1144
|
if key not in avail_keys:
|
1095
1145
|
if key in self._categoricals:
|
1096
|
-
logger.
|
1146
|
+
logger.warning(f"No non-standardized values found for {key!r}")
|
1097
1147
|
else:
|
1098
1148
|
raise KeyError(
|
1099
1149
|
f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
|
@@ -1173,7 +1223,9 @@ class AnnDataCatManager(CatManager):
|
|
1173
1223
|
sources: dict[str, Record] | None = None,
|
1174
1224
|
) -> None:
|
1175
1225
|
if isinstance(var_index, str):
|
1176
|
-
raise TypeError(
|
1226
|
+
raise TypeError(
|
1227
|
+
"var_index parameter has to be a field, e.g. Gene.ensembl_gene_id"
|
1228
|
+
)
|
1177
1229
|
|
1178
1230
|
if not data_is_anndata(data):
|
1179
1231
|
raise TypeError("data has to be an AnnData object")
|
@@ -1223,6 +1275,8 @@ class AnnDataCatManager(CatManager):
|
|
1223
1275
|
categoricals=self._obs_fields,
|
1224
1276
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
1225
1277
|
public=public,
|
1278
|
+
organism=self._organism,
|
1279
|
+
sources=self._sources,
|
1226
1280
|
)
|
1227
1281
|
|
1228
1282
|
def _save_from_var_index(
|
@@ -1433,6 +1487,8 @@ class MuDataCatManager(CatManager):
|
|
1433
1487
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
1434
1488
|
},
|
1435
1489
|
public=public,
|
1490
|
+
organism=self._organism,
|
1491
|
+
sources=self._sources,
|
1436
1492
|
)
|
1437
1493
|
|
1438
1494
|
@deprecated(new_name="is run by default")
|
@@ -1442,7 +1498,7 @@ class MuDataCatManager(CatManager):
|
|
1442
1498
|
column_names: list[str] | None = None,
|
1443
1499
|
**kwargs,
|
1444
1500
|
):
|
1445
|
-
pass
|
1501
|
+
pass # pragma: no cover
|
1446
1502
|
|
1447
1503
|
def add_new_from_var_index(self, modality: str, **kwargs):
|
1448
1504
|
"""Update variable records.
|
@@ -1487,13 +1543,7 @@ class MuDataCatManager(CatManager):
|
|
1487
1543
|
def validate(self) -> bool:
|
1488
1544
|
"""Validate categories."""
|
1489
1545
|
# add all validated records to the current instance
|
1490
|
-
|
1491
|
-
try:
|
1492
|
-
settings.verbosity = "error"
|
1493
|
-
self._update_registry_all()
|
1494
|
-
finally:
|
1495
|
-
settings.verbosity = verbosity
|
1496
|
-
|
1546
|
+
self._update_registry_all()
|
1497
1547
|
self._non_validated = {} # type: ignore
|
1498
1548
|
|
1499
1549
|
obs_validated = True
|
@@ -1684,6 +1734,8 @@ class SpatialDataCatManager(CatManager):
|
|
1684
1734
|
categoricals=cat_values_dict,
|
1685
1735
|
slots={"accessors": cat_values_dict.keys()},
|
1686
1736
|
public=public,
|
1737
|
+
organism=self._organism,
|
1738
|
+
sources=self._sources,
|
1687
1739
|
)
|
1688
1740
|
|
1689
1741
|
def _update_registry_all(self) -> None:
|
@@ -1799,12 +1851,7 @@ class SpatialDataCatManager(CatManager):
|
|
1799
1851
|
Whether the SpatialData object is validated.
|
1800
1852
|
"""
|
1801
1853
|
# add all validated records to the current instance
|
1802
|
-
|
1803
|
-
try:
|
1804
|
-
settings.verbosity = "error"
|
1805
|
-
self._update_registry_all()
|
1806
|
-
finally:
|
1807
|
-
settings.verbosity = verbosity
|
1854
|
+
self._update_registry_all()
|
1808
1855
|
|
1809
1856
|
self._non_validated = {} # type: ignore
|
1810
1857
|
|
@@ -1957,15 +2004,12 @@ class TiledbsomaCatManager(CatManager):
|
|
1957
2004
|
|
1958
2005
|
# register obs columns' names
|
1959
2006
|
register_columns = list(self._obs_fields.keys())
|
1960
|
-
organism = configure_organism(
|
1961
|
-
self._columns_field.field.model, self._organism
|
1962
|
-
).get("organism")
|
1963
2007
|
update_registry(
|
1964
2008
|
values=register_columns,
|
1965
2009
|
field=self._columns_field,
|
1966
2010
|
key="columns",
|
1967
2011
|
validated_only=False,
|
1968
|
-
organism=
|
2012
|
+
organism=self._organism,
|
1969
2013
|
source=self._sources.get("columns"),
|
1970
2014
|
)
|
1971
2015
|
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
@@ -1979,7 +2023,7 @@ class TiledbsomaCatManager(CatManager):
|
|
1979
2023
|
field=self._columns_field,
|
1980
2024
|
key="columns",
|
1981
2025
|
validated_only=True,
|
1982
|
-
organism=
|
2026
|
+
organism=self._organism,
|
1983
2027
|
source=self._sources.get("columns"),
|
1984
2028
|
)
|
1985
2029
|
|
@@ -1999,22 +2043,19 @@ class TiledbsomaCatManager(CatManager):
|
|
1999
2043
|
var_ms_values = (
|
2000
2044
|
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
2001
2045
|
)
|
2002
|
-
organism = configure_organism(field.field.model, self._organism).get(
|
2003
|
-
"organism"
|
2004
|
-
)
|
2005
2046
|
update_registry(
|
2006
2047
|
values=var_ms_values,
|
2007
2048
|
field=field,
|
2008
2049
|
key=var_ms_key,
|
2009
2050
|
validated_only=True,
|
2010
|
-
organism=
|
2051
|
+
organism=self._organism,
|
2011
2052
|
source=self._sources.get(var_ms_key),
|
2012
2053
|
)
|
2013
2054
|
_, non_val = validate_categories(
|
2014
2055
|
values=var_ms_values,
|
2015
2056
|
field=field,
|
2016
2057
|
key=var_ms_key,
|
2017
|
-
organism=
|
2058
|
+
organism=self._organism,
|
2018
2059
|
source=self._sources.get(var_ms_key),
|
2019
2060
|
)
|
2020
2061
|
if len(non_val) > 0:
|
@@ -2031,22 +2072,19 @@ class TiledbsomaCatManager(CatManager):
|
|
2031
2072
|
values = pa.compute.unique(
|
2032
2073
|
obs.read(column_names=[key]).concat()[key]
|
2033
2074
|
).to_pylist()
|
2034
|
-
organism = configure_organism(field.field.model, self._organism).get(
|
2035
|
-
"organism"
|
2036
|
-
)
|
2037
2075
|
update_registry(
|
2038
2076
|
values=values,
|
2039
2077
|
field=field,
|
2040
2078
|
key=key,
|
2041
2079
|
validated_only=True,
|
2042
|
-
organism=
|
2080
|
+
organism=self._organism,
|
2043
2081
|
source=self._sources.get(key),
|
2044
2082
|
)
|
2045
2083
|
_, non_val = validate_categories(
|
2046
2084
|
values=values,
|
2047
2085
|
field=field,
|
2048
2086
|
key=key,
|
2049
|
-
organism=
|
2087
|
+
organism=self._organism,
|
2050
2088
|
source=self._sources.get(key),
|
2051
2089
|
)
|
2052
2090
|
if len(non_val) > 0:
|
@@ -2095,15 +2133,12 @@ class TiledbsomaCatManager(CatManager):
|
|
2095
2133
|
values, field = self._non_validated_values_field(k)
|
2096
2134
|
if len(values) == 0:
|
2097
2135
|
continue
|
2098
|
-
organism = configure_organism(field.field.model, self._organism).get(
|
2099
|
-
"organism"
|
2100
|
-
)
|
2101
2136
|
update_registry(
|
2102
2137
|
values=values,
|
2103
2138
|
field=field,
|
2104
2139
|
key=k,
|
2105
2140
|
validated_only=False,
|
2106
|
-
organism=
|
2141
|
+
organism=self._organism,
|
2107
2142
|
source=self._sources.get(k),
|
2108
2143
|
**kwargs,
|
2109
2144
|
)
|
@@ -2138,6 +2173,8 @@ class TiledbsomaCatManager(CatManager):
|
|
2138
2173
|
categoricals=self._obs_fields,
|
2139
2174
|
slots={"columns": self._columns_field, **self._var_fields_flat},
|
2140
2175
|
public=public,
|
2176
|
+
organism=self._organism,
|
2177
|
+
sources=self._sources,
|
2141
2178
|
)
|
2142
2179
|
|
2143
2180
|
def standardize(self, key: str):
|
@@ -2173,16 +2210,11 @@ class TiledbsomaCatManager(CatManager):
|
|
2173
2210
|
else:
|
2174
2211
|
slot = lambda experiment: experiment.obs
|
2175
2212
|
slot_key = k
|
2176
|
-
# errors if public ontology and the model has no organism
|
2177
|
-
# has to be fixed in bionty
|
2178
|
-
organism = configure_organism(field.field.model, self._organism).get(
|
2179
|
-
"organism"
|
2180
|
-
)
|
2181
2213
|
syn_mapper = standardize_categories(
|
2182
2214
|
values=values,
|
2183
2215
|
field=field,
|
2184
2216
|
source=self._sources.get(k),
|
2185
|
-
organism=
|
2217
|
+
organism=self._organism,
|
2186
2218
|
)
|
2187
2219
|
if (n_syn_mapper := len(syn_mapper)) == 0:
|
2188
2220
|
continue
|
@@ -2259,9 +2291,6 @@ class TiledbsomaCatManager(CatManager):
|
|
2259
2291
|
|
2260
2292
|
feature_sets = {}
|
2261
2293
|
if len(self._obs_fields) > 0:
|
2262
|
-
organism = configure_organism(
|
2263
|
-
self._columns_field.field.model, self._organism
|
2264
|
-
).get("organism")
|
2265
2294
|
empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
|
2266
2295
|
mock_df = pa.Table.from_pydict(
|
2267
2296
|
empty_dict, schema=self._obs_pa_schema
|
@@ -2271,17 +2300,14 @@ class TiledbsomaCatManager(CatManager):
|
|
2271
2300
|
df=mock_df,
|
2272
2301
|
field=self._columns_field,
|
2273
2302
|
mute=True,
|
2274
|
-
organism=
|
2303
|
+
organism=self._organism,
|
2275
2304
|
)
|
2276
2305
|
for ms in self._var_fields:
|
2277
2306
|
var_key, var_field = self._var_fields[ms]
|
2278
|
-
organism = configure_organism(var_field.field.model, self._organism).get(
|
2279
|
-
"organism"
|
2280
|
-
)
|
2281
2307
|
feature_sets[f"{ms}__var"] = Schema.from_values(
|
2282
2308
|
values=self._validated_values[f"{ms}__{var_key}"],
|
2283
2309
|
field=var_field,
|
2284
|
-
organism=
|
2310
|
+
organism=self._organism,
|
2285
2311
|
raise_validation_error=False,
|
2286
2312
|
)
|
2287
2313
|
artifact._staged_feature_sets = feature_sets
|
@@ -2291,11 +2317,10 @@ class TiledbsomaCatManager(CatManager):
|
|
2291
2317
|
for key, field in self._obs_fields.items():
|
2292
2318
|
feature = features.get(key)
|
2293
2319
|
registry = field.field.model
|
2294
|
-
organism = configure_organism(field.field.model, self._organism).get(
|
2295
|
-
"organism"
|
2296
|
-
)
|
2297
2320
|
labels = registry.from_values(
|
2298
|
-
values=self._validated_values[key],
|
2321
|
+
values=self._validated_values[key],
|
2322
|
+
field=field,
|
2323
|
+
organism=self._organism,
|
2299
2324
|
)
|
2300
2325
|
if len(labels) == 0:
|
2301
2326
|
continue
|
@@ -2722,10 +2747,11 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
|
2722
2747
|
import wetlab as wl
|
2723
2748
|
|
2724
2749
|
sources = {}
|
2725
|
-
|
2726
|
-
|
2727
|
-
|
2728
|
-
|
2750
|
+
# # do not yet specify cell_line source
|
2751
|
+
# if "cell_line" in adata.obs.columns:
|
2752
|
+
# sources["cell_line"] = bt.Source.filter(
|
2753
|
+
# entity="bionty.CellLine", name="depmap"
|
2754
|
+
# ).first()
|
2729
2755
|
if "pert_compound" in adata.obs.columns:
|
2730
2756
|
with logger.mute():
|
2731
2757
|
chebi_source = bt.Source.filter(
|
@@ -2908,35 +2934,43 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
|
2908
2934
|
source = kwargs.get("source")
|
2909
2935
|
organism = kwargs.get("organism")
|
2910
2936
|
filter_kwargs = kwargs.copy()
|
2911
|
-
|
2912
|
-
|
2913
|
-
|
2914
|
-
|
2915
|
-
|
2916
|
-
|
2917
|
-
|
2918
|
-
|
2919
|
-
|
2920
|
-
|
2921
|
-
|
2922
|
-
|
2923
|
-
|
2924
|
-
|
2925
|
-
filter_kwargs["source"] = source_default
|
2926
|
-
finally:
|
2927
|
-
settings.verbosity = verbosity
|
2937
|
+
|
2938
|
+
if isinstance(organism, Record) and organism._state.db != "default":
|
2939
|
+
if db is None or db == "default":
|
2940
|
+
organism_default = copy.copy(organism)
|
2941
|
+
# save the organism record in the default database
|
2942
|
+
organism_default.save()
|
2943
|
+
filter_kwargs["organism"] = organism_default
|
2944
|
+
if isinstance(source, Record) and source._state.db != "default":
|
2945
|
+
if db is None or db == "default":
|
2946
|
+
source_default = copy.copy(source)
|
2947
|
+
# save the source record in the default database
|
2948
|
+
source_default.save()
|
2949
|
+
filter_kwargs["source"] = source_default
|
2950
|
+
|
2928
2951
|
return filter_kwargs
|
2929
2952
|
|
2930
2953
|
|
2931
|
-
def
|
2954
|
+
def get_organism_kwargs(
|
2955
|
+
field: FieldAttr, organism: str | None = None
|
2956
|
+
) -> dict[str, str]:
|
2932
2957
|
"""Check if a registry needs an organism and return the organism name."""
|
2933
|
-
|
2934
|
-
|
2935
|
-
if _is_organism_required(registry):
|
2958
|
+
registry = field.field.model
|
2959
|
+
if registry.__base__.__name__ == "BioRecord":
|
2936
2960
|
import bionty as bt
|
2961
|
+
from bionty._organism import is_organism_required
|
2937
2962
|
|
2938
|
-
|
2939
|
-
|
2963
|
+
from ..models._from_values import get_organism_record_from_field
|
2964
|
+
|
2965
|
+
if is_organism_required(registry):
|
2966
|
+
if organism is not None or bt.settings.organism is not None:
|
2967
|
+
return {"organism": organism or bt.settings.organism.name}
|
2968
|
+
else:
|
2969
|
+
organism_record = get_organism_record_from_field(
|
2970
|
+
field, organism=organism
|
2971
|
+
)
|
2972
|
+
if organism_record is not None:
|
2973
|
+
return {"organism": organism_record.name}
|
2940
2974
|
return {}
|
2941
2975
|
|
2942
2976
|
|
@@ -2969,17 +3003,16 @@ def validate_categories(
|
|
2969
3003
|
|
2970
3004
|
registry = field.field.model
|
2971
3005
|
|
2972
|
-
|
2973
|
-
|
2974
|
-
|
2975
|
-
kwargs_current = get_current_filter_kwargs(registry, kwargs)
|
3006
|
+
kwargs_current = get_current_filter_kwargs(
|
3007
|
+
registry, {"organism": organism, "source": source}
|
3008
|
+
)
|
2976
3009
|
|
2977
3010
|
# inspect values from the default instance
|
2978
3011
|
inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
|
2979
3012
|
non_validated = inspect_result.non_validated
|
2980
3013
|
syn_mapper = inspect_result.synonyms_mapper
|
2981
3014
|
|
2982
|
-
# inspect the non-validated values from public (
|
3015
|
+
# inspect the non-validated values from public (BioRecord only)
|
2983
3016
|
values_validated = []
|
2984
3017
|
if hasattr(registry, "public"):
|
2985
3018
|
public_records = registry.from_values(
|
@@ -3134,18 +3167,6 @@ def save_artifact(
|
|
3134
3167
|
)
|
3135
3168
|
artifact.save()
|
3136
3169
|
|
3137
|
-
if organism is not None and index_field is not None:
|
3138
|
-
feature_kwargs = configure_organism(
|
3139
|
-
(
|
3140
|
-
list(index_field.values())[0].field.model
|
3141
|
-
if isinstance(index_field, dict)
|
3142
|
-
else index_field.field.model
|
3143
|
-
),
|
3144
|
-
organism,
|
3145
|
-
)
|
3146
|
-
else:
|
3147
|
-
feature_kwargs = {}
|
3148
|
-
|
3149
3170
|
def _add_labels(
|
3150
3171
|
data: pd.DataFrame | ScverseDataStructures,
|
3151
3172
|
artifact: Artifact,
|
@@ -3156,19 +3177,15 @@ def save_artifact(
|
|
3156
3177
|
for key, field in fields.items():
|
3157
3178
|
feature = features.get(key)
|
3158
3179
|
registry = field.field.model
|
3159
|
-
|
3160
|
-
|
3180
|
+
# we don't need source here because all records are already in the DB
|
3181
|
+
filter_kwargs = get_current_filter_kwargs(registry, {"organism": organism})
|
3161
3182
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
3162
3183
|
# multi-value columns are separated by "|"
|
3163
3184
|
if not df[key].isna().all() and df[key].str.contains("|").any():
|
3164
3185
|
values = df[key].str.split("|").explode().unique()
|
3165
3186
|
else:
|
3166
3187
|
values = df[key].unique()
|
3167
|
-
labels = registry.from_values(
|
3168
|
-
values,
|
3169
|
-
field=field,
|
3170
|
-
**filter_kwargs_current,
|
3171
|
-
)
|
3188
|
+
labels = registry.from_values(values, field=field, **filter_kwargs)
|
3172
3189
|
if len(labels) == 0:
|
3173
3190
|
continue
|
3174
3191
|
label_ref_is_name = None
|
@@ -3185,20 +3202,26 @@ def save_artifact(
|
|
3185
3202
|
|
3186
3203
|
match artifact.otype:
|
3187
3204
|
case "DataFrame":
|
3188
|
-
artifact.features._add_set_from_df(field=index_field,
|
3205
|
+
artifact.features._add_set_from_df(field=index_field, organism=organism) # type: ignore
|
3189
3206
|
_add_labels(
|
3190
3207
|
data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
|
3191
3208
|
)
|
3192
3209
|
case "AnnData":
|
3210
|
+
if schema is not None and "uns" in schema.slots:
|
3211
|
+
uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
|
3212
|
+
"field"
|
3213
|
+
]
|
3214
|
+
else:
|
3215
|
+
uns_field = None
|
3193
3216
|
artifact.features._add_set_from_anndata( # type: ignore
|
3194
|
-
var_field=index_field,
|
3217
|
+
var_field=index_field, uns_field=uns_field, organism=organism
|
3195
3218
|
)
|
3196
3219
|
_add_labels(
|
3197
3220
|
data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
|
3198
3221
|
)
|
3199
3222
|
case "MuData":
|
3200
3223
|
artifact.features._add_set_from_mudata( # type: ignore
|
3201
|
-
var_fields=index_field,
|
3224
|
+
var_fields=index_field, organism=organism
|
3202
3225
|
)
|
3203
3226
|
for modality, modality_fields in fields.items():
|
3204
3227
|
column_field_modality = index_field.get(modality)
|
@@ -3228,7 +3251,7 @@ def save_artifact(
|
|
3228
3251
|
artifact.features._add_set_from_spatialdata( # type: ignore
|
3229
3252
|
sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
|
3230
3253
|
var_fields=index_field,
|
3231
|
-
|
3254
|
+
organism=organism,
|
3232
3255
|
)
|
3233
3256
|
sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
|
3234
3257
|
for accessor, accessor_fields in fields.items():
|
@@ -3305,77 +3328,63 @@ def update_registry(
|
|
3305
3328
|
from lamindb.models.save import save as ln_save
|
3306
3329
|
|
3307
3330
|
registry = field.field.model
|
3308
|
-
filter_kwargs =
|
3309
|
-
|
3331
|
+
filter_kwargs = get_current_filter_kwargs(
|
3332
|
+
registry, {"organism": organism, "source": source}
|
3333
|
+
)
|
3310
3334
|
values = [i for i in values if isinstance(i, str) and i]
|
3311
3335
|
if not values:
|
3312
3336
|
return
|
3313
3337
|
|
3314
|
-
|
3315
|
-
try:
|
3316
|
-
settings.verbosity = "error"
|
3317
|
-
labels_saved: dict = {"from public": [], "new": []}
|
3338
|
+
labels_saved: dict = {"from public": [], "new": []}
|
3318
3339
|
|
3319
|
-
|
3320
|
-
|
3321
|
-
|
3322
|
-
|
3323
|
-
|
3324
|
-
|
3325
|
-
|
3326
|
-
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
if source
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
ln_save(public_records)
|
3338
|
-
labels_saved["from public"] = [
|
3339
|
-
getattr(r, field.field.name) for r in public_records
|
3340
|
-
]
|
3341
|
-
# non-validated records from the default instance
|
3342
|
-
non_validated_labels = [
|
3343
|
-
i for i in values if i not in existing_and_public_labels
|
3340
|
+
# inspect the default instance and save validated records from public
|
3341
|
+
existing_and_public_records = registry.from_values(
|
3342
|
+
list(values), field=field, **filter_kwargs, mute=True
|
3343
|
+
)
|
3344
|
+
existing_and_public_labels = [
|
3345
|
+
getattr(r, field.field.name) for r in existing_and_public_records
|
3346
|
+
]
|
3347
|
+
# public records that are not already in the database
|
3348
|
+
public_records = [r for r in existing_and_public_records if r._state.adding]
|
3349
|
+
# here we check to only save the public records if they are from the specified source
|
3350
|
+
# we check the uid because r.source and source can be from different instances
|
3351
|
+
if source:
|
3352
|
+
public_records = [r for r in public_records if r.source.uid == source.uid]
|
3353
|
+
if len(public_records) > 0:
|
3354
|
+
logger.info(f"saving validated records of '{key}'")
|
3355
|
+
ln_save(public_records)
|
3356
|
+
labels_saved["from public"] = [
|
3357
|
+
getattr(r, field.field.name) for r in public_records
|
3344
3358
|
]
|
3359
|
+
# non-validated records from the default instance
|
3360
|
+
non_validated_labels = [i for i in values if i not in existing_and_public_labels]
|
3361
|
+
|
3362
|
+
# save non-validated/new records
|
3363
|
+
labels_saved["new"] = non_validated_labels
|
3364
|
+
if not validated_only:
|
3365
|
+
non_validated_records: RecordList[Any] = [] # type: ignore
|
3366
|
+
if df is not None and registry == Feature:
|
3367
|
+
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
3368
|
+
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
3369
|
+
else:
|
3370
|
+
if (
|
3371
|
+
organism
|
3372
|
+
and hasattr(registry, "organism")
|
3373
|
+
and registry._meta.get_field("organism").is_relation
|
3374
|
+
):
|
3375
|
+
# make sure organism record is saved to the current instance
|
3376
|
+
create_kwargs["organism"] = _save_organism(name=organism)
|
3345
3377
|
|
3346
|
-
|
3347
|
-
|
3348
|
-
|
3349
|
-
|
3350
|
-
|
3351
|
-
|
3352
|
-
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
3353
|
-
else:
|
3354
|
-
if "organism" in filter_kwargs:
|
3355
|
-
# make sure organism record is saved to the current instance
|
3356
|
-
filter_kwargs["organism"] = _save_organism(name=organism)
|
3357
|
-
init_kwargs = {}
|
3358
|
-
for value in labels_saved["new"]:
|
3359
|
-
init_kwargs[field.field.name] = value
|
3360
|
-
if registry == Feature:
|
3361
|
-
init_kwargs["dtype"] = "cat" if dtype is None else dtype
|
3362
|
-
non_validated_records.append(
|
3363
|
-
registry(
|
3364
|
-
**init_kwargs,
|
3365
|
-
**{k: v for k, v in filter_kwargs.items() if k != "source"},
|
3366
|
-
**{
|
3367
|
-
k: v for k, v in create_kwargs.items() if k != "sources"
|
3368
|
-
},
|
3369
|
-
)
|
3370
|
-
)
|
3371
|
-
ln_save(non_validated_records)
|
3372
|
-
|
3373
|
-
# save parent labels for ulabels, for example a parent label "project" for label "project001"
|
3374
|
-
if registry == ULabel and field.field.name == "name":
|
3375
|
-
save_ulabels_type(values, field=field, key=key)
|
3378
|
+
for value in labels_saved["new"]:
|
3379
|
+
init_kwargs = {field.field.name: value}
|
3380
|
+
if registry == Feature:
|
3381
|
+
init_kwargs["dtype"] = "cat" if dtype is None else dtype
|
3382
|
+
non_validated_records.append(registry(**init_kwargs, **create_kwargs))
|
3383
|
+
ln_save(non_validated_records)
|
3376
3384
|
|
3377
|
-
|
3378
|
-
|
3385
|
+
# save parent labels for ulabels, for example a parent label "project" for label "project001"
|
3386
|
+
if registry == ULabel and field.field.name == "name":
|
3387
|
+
save_ulabels_type(values, field=field, key=key)
|
3379
3388
|
|
3380
3389
|
log_saved_labels(
|
3381
3390
|
labels_saved,
|
@@ -3433,8 +3442,9 @@ def _save_organism(name: str):
|
|
3433
3442
|
organism = bt.Organism.from_source(name=name)
|
3434
3443
|
if organism is None:
|
3435
3444
|
raise ValidationError(
|
3436
|
-
f'Organism "{name}" not found\n'
|
3437
|
-
f' → please save it: bt.Organism(name="{name}").save()'
|
3445
|
+
f'Organism "{name}" not found from public reference\n'
|
3446
|
+
f' → please save it from a different source: bt.Organism.from_source(name="{name}", source).save()'
|
3447
|
+
f' → or manually save it without source: bt.Organism(name="{name}").save()'
|
3438
3448
|
)
|
3439
3449
|
organism.save()
|
3440
3450
|
return organism
|