lamindb 0.77.2__py3-none-any.whl → 0.77.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_can_curate.py +6 -4
- lamindb/_curate.py +589 -45
- lamindb/_finish.py +7 -7
- lamindb/_from_values.py +7 -7
- lamindb/_record.py +7 -5
- lamindb/_save.py +9 -2
- lamindb/_view.py +2 -1
- lamindb/core/__init__.py +2 -0
- lamindb/core/_context.py +2 -4
- lamindb/core/_django.py +30 -17
- lamindb/core/_feature_manager.py +64 -41
- lamindb/core/_label_manager.py +15 -14
- lamindb/core/loaders.py +18 -1
- lamindb/core/storage/_tiledbsoma.py +2 -0
- {lamindb-0.77.2.dist-info → lamindb-0.77.3.dist-info}/METADATA +4 -3
- {lamindb-0.77.2.dist-info → lamindb-0.77.3.dist-info}/RECORD +19 -19
- {lamindb-0.77.2.dist-info → lamindb-0.77.3.dist-info}/LICENSE +0 -0
- {lamindb-0.77.2.dist-info → lamindb-0.77.3.dist-info}/WHEEL +0 -0
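The main change in this release is a new curation flow for `tiledbsoma` stores in `lamindb/_curate.py` (a `SOMACurator` class plus `Curator.from_tiledbsoma`). Before the full diff, here is a minimal usage sketch assembled from the docstring examples added below; the store path, registry fields, and label names are illustrative placeholders, not part of the diff.

import bionty as bt
import lamindb as ln

# Sketch only: "./my_array_store.tiledbsoma" and the chosen fields/labels are placeholders.
curator = ln.Curator.from_tiledbsoma(
    "./my_array_store.tiledbsoma",                  # local or cloud path to a tiledbsoma.Experiment
    var_index={"RNA": ("var_id", bt.Gene.symbol)},  # per-measurement (var column, registry field)
    categoricals={
        "cell_type_ontology_id": bt.CellType.ontology_id,
        "donor_id": ln.ULabel.name,
    },
    organism="human",
)
curator.validate()                  # validates .obs columns and .var indices
curator.add_new_from("donor_id")    # registers new labels for a non-validated .obs column
curator.standardize("RNA__var_id")  # fixes synonyms in place, using flattened '{measurement}__{column}' keys
artifact = curator.save_artifact(description="my tiledbsoma store")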
lamindb/_curate.py
CHANGED
@@ -2,22 +2,26 @@ from __future__ import annotations
 
 import copy
 import warnings
+from itertools import chain
 from typing import TYPE_CHECKING
 
 import anndata as ad
 import lamindb_setup as ln_setup
 import pandas as pd
+import pyarrow as pa
 from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
+from lamindb_setup.core.upath import UPath
 from lnschema_core import (
     Artifact,
     Feature,
+    FeatureSet,
     Record,
     Run,
     ULabel,
 )
 
-from ._from_values import
+from ._from_values import _format_values
 from .core.exceptions import ValidationError
 
 if TYPE_CHECKING:
@@ -40,8 +44,8 @@ class CurateLookup:
         public: Whether to lookup from the public instance. Defaults to False.
 
     Example:
-        >>>
-        >>>
+        >>> curator = ln.Curator.from_df(...)
+        >>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
         <Category: alveolar_type_1_fibroblast_cell>
 
     """
@@ -96,7 +100,7 @@ class CurateLookup:
                 f"Lookup objects from the {colors.italic(ref)}:\n "
                 f"{colors.green(getattr_keys)}\n "
                 f"{colors.green(getitem_keys)}\n"
-                'Example:\n → categories =
+                'Example:\n → categories = curator.lookup()["cell_type"]\n'
                 " → categories.alveolar_type_1_fibroblast_cell\n\n"
                 "To look up public ontologies, use .lookup(public=True)"
             )
@@ -107,6 +111,15 @@ class BaseCurator:
 class BaseCurator:
     """Curate a dataset."""
 
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        import sys
+
+        # Deprecated methods
+        if "sphinx" not in sys.modules:
+            if hasattr(cls, "_add_new_from_columns"):
+                cls.add_new_from_columns = cls._add_new_from_columns
+
     def validate(self) -> bool:
         """Validate dataset.
 
@@ -164,14 +177,16 @@ class DataFrameCurator(BaseCurator):
         verbosity: The verbosity level.
         organism: The organism name.
         sources: A dictionary mapping column names to Source records.
-        exclude: A dictionary mapping column names to values to exclude.
+        exclude: A dictionary mapping column names to values to exclude from validation.
+            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
+            using the exclude parameter ensures they are not validated.
 
     Returns:
         A curator object.
 
     Examples:
         >>> import bionty as bt
-        >>>
+        >>> curator = ln.Curator.from_df(
         ...     df,
         ...     categoricals={
         ...         "cell_type_ontology_id": bt.CellType.ontology_id,
@@ -255,7 +270,7 @@ class DataFrameCurator(BaseCurator):
             are = "are" if n > 1 else "is"
             if len(nonval_keys) > 0:
                 raise ValidationError(
-                    f"
+                    f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
                 )
 
     def _save_columns(self, validated_only: bool = True) -> None:
@@ -300,7 +315,7 @@ class DataFrameCurator(BaseCurator):
         self._kwargs.update({"organism": organism} if organism else {})
         self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
 
-    def
+    def _add_new_from_columns(self, organism: str | None = None, **kwargs):
         """Deprecated to run by default during init."""
         warnings.warn(
             "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
@@ -323,7 +338,7 @@ class DataFrameCurator(BaseCurator):
         # logging
         n = len(syn_mapper)
         if n > 0:
-            syn_mapper_print =
+            syn_mapper_print = _format_values(
                 [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
             )
             s = "s" if n > 1 else ""
@@ -332,13 +347,13 @@ class DataFrameCurator(BaseCurator):
             )
         return std_values
 
-    def standardize(self, key: str):
+    def standardize(self, key: str) -> None:
         """Replace synonyms with standardized values.
 
-        Args:
-            key: The key referencing the slot in the DataFrame from which to draw terms.
-
         Modifies the input dataset inplace.
+
+        Args:
+            key: The key referencing the column in the DataFrame to standardize.
         """
         # list is needed to avoid RuntimeError: dictionary changed size during iteration
         avail_keys = list(self.non_validated.keys())
@@ -359,9 +374,12 @@ class DataFrameCurator(BaseCurator):
                     self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
         else:
             if key not in avail_keys:
-
-                    f
-
+                if key in self._fields:
+                    logger.info(f"No unstandardized values found for {key!r}")
+                else:
+                    raise KeyError(
+                        f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
+                    )
             else:
                 if key in self._fields:  # needed to exclude var_index
                     syn_mapper = standardize_categories(
@@ -375,7 +393,9 @@ class DataFrameCurator(BaseCurator):
                         key, syn_mapper, self._df[key]
                     )
 
-    def _update_registry(
+    def _update_registry(
+        self, categorical: str, validated_only: bool = True, **kwargs
+    ) -> None:
         if categorical == "all":
             self._update_registry_all(validated_only=validated_only, **kwargs)
         else:
@@ -441,7 +461,8 @@ class DataFrameCurator(BaseCurator):
 
         Args:
             description: Description of the DataFrame object.
-            key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
+            key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
+                Artifacts with the same key form a revision family.
             revises: Previous version of the artifact. Triggers a revision.
             run: The run that creates the artifact.
 
@@ -502,11 +523,13 @@ class AnnDataCurator(DataFrameCurator):
         verbosity: The verbosity level.
         organism: The organism name.
         sources: A dictionary mapping ``.obs.columns`` to Source records.
-        exclude: A dictionary mapping column names to values to exclude.
+        exclude: A dictionary mapping column names to values to exclude from validation.
+            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
+            using the exclude parameter ensures they are not validated.
 
     Examples:
         >>> import bionty as bt
-        >>>
+        >>> curator = ln.Curator.from_anndata(
         ...     adata,
         ...     var_index=bt.Gene.ensembl_gene_id,
         ...     categoricals={
@@ -710,7 +733,8 @@ class AnnDataCurator(DataFrameCurator):
 
         Args:
             description: A description of the ``AnnData`` object.
-            key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
+            key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
+                Artifacts with the same key form a revision family.
             revises: Previous version of the artifact. Triggers a revision.
             run: The run that creates the artifact.
 
@@ -761,11 +785,13 @@ class MuDataCurator:
         verbosity: The verbosity level.
         organism: The organism name.
         sources: A dictionary mapping ``.obs.columns`` to Source records.
-        exclude: A dictionary mapping column names to values to exclude.
+        exclude: A dictionary mapping column names to values to exclude from validation.
+            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
+            using the exclude parameter ensures they are not validated.
 
     Examples:
         >>> import bionty as bt
-        >>>
+        >>> curator = ln.Curator.from_mudata(
         ...     mdata,
         ...     var_index={
         ...         "rna": bt.Gene.ensembl_gene_id,
@@ -1058,6 +1084,503 @@ class MuDataCurator:
         return self._artifact
 
 
+def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
+    if (n := len(nonval_keys)) > 0:
+        s = "s" if n > 1 else ""
+        are = "are" if n > 1 else "is"
+        raise ValidationError(
+            f"key{s} passed to {name} {are} not present: {colors.yellow(_format_values(nonval_keys))}"
+        )
+
+
+class SOMACurator(BaseCurator):
+    """Curation flow for ``tiledbsoma``.
+
+    See also :class:`~lamindb.Curator`.
+
+    Args:
+        experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
+        var_index: The registry fields for mapping the `.var` indices for measurements.
+            Should be in the form `{"measurement name": ("var column", field)}`.
+            These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
+            in `.standardize` or `.add_new_from`, see the output of `.var_index`.
+        categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
+        obs_columns: The registry field for mapping the names of the `.obs` columns.
+        organism: The organism name.
+        sources: A dictionary mapping `.obs` columns to Source records.
+        exclude: A dictionary mapping column names to values to exclude from validation.
+            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
+            using the exclude parameter ensures they are not validated.
+
+    Examples:
+        >>> import bionty as bt
+        >>> curator = ln.Curator.from_tiledbsoma(
+        ...     "./my_array_store.tiledbsoma",
+        ...     var_index={"RNA": ("var_id", bt.Gene.symbol)},
+        ...     categoricals={
+        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
+        ...         "donor_id": ln.ULabel.name
+        ...     },
+        ...     organism="human",
+        ... )
+    """
+
+    def __init__(
+        self,
+        experiment_uri: UPathStr | Artifact,
+        var_index: dict[str, tuple[str, FieldAttr]],
+        categoricals: dict[str, FieldAttr] | None = None,
+        obs_columns: FieldAttr = Feature.name,
+        organism: str | None = None,
+        sources: dict[str, Record] | None = None,
+        exclude: dict[str, str | list[str]] | None = None,
+        using_key: str | None = None,
+    ):
+        self._obs_fields = categoricals or {}
+        self._var_fields = var_index
+        self._columns_field = obs_columns
+        if isinstance(experiment_uri, Artifact):
+            self._experiment_uri = experiment_uri.path
+            self._artifact = experiment_uri
+        else:
+            self._experiment_uri = UPath(experiment_uri)
+            self._artifact = None
+        self._organism = organism
+        self._using_key = using_key
+        self._sources = sources or {}
+        self._exclude = exclude or {}
+
+        self._validated: bool | None = False
+        self._non_validated_values: dict[str, list] | None = None
+        self._validated_values: dict[str, list] = {}
+        # filled by _check_save_keys
+        self._n_obs: int | None = None
+        self._valid_obs_keys: list[str] | None = None
+        self._valid_var_keys: list[str] | None = None
+        self._var_fields_flat: dict[str, FieldAttr] | None = None
+        self._check_save_keys()
+
+    # check that the provided keys in var_index and categoricals are available in the store
+    # and save features
+    def _check_save_keys(self):
+        from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
+
+        with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
+            experiment_obs = experiment.obs
+            self._n_obs = len(experiment_obs)
+            valid_obs_keys = [k for k in experiment_obs.keys() if k != "soma_joinid"]
+            self._valid_obs_keys = valid_obs_keys
+
+            valid_var_keys = []
+            ms_list = []
+            for ms in experiment.ms.keys():
+                ms_list.append(ms)
+                var_ms = experiment.ms[ms].var
+                valid_var_keys += [
+                    f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
+                ]
+            self._valid_var_keys = valid_var_keys
+
+        # check validity of keys in categoricals
+        nonval_keys = []
+        for obs_key in self._obs_fields.keys():
+            if obs_key not in valid_obs_keys:
+                nonval_keys.append(obs_key)
+        _maybe_curation_keys_not_present(nonval_keys, "categoricals")
+
+        # check validity of keys in var_index
+        self._var_fields_flat = {}
+        nonval_keys = []
+        for ms_key in self._var_fields.keys():
+            var_key, var_field = self._var_fields[ms_key]
+            var_key_flat = f"{ms_key}__{var_key}"
+            if var_key_flat not in valid_var_keys:
+                nonval_keys.append(f"({ms_key}, {var_key})")
+            else:
+                self._var_fields_flat[var_key_flat] = var_field
+        _maybe_curation_keys_not_present(nonval_keys, "var_index")
+
+        # check validity of keys in sources and exclude
+        valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
+        for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
+            nonval_keys = []
+            for arg_key in dct.keys():
+                if arg_key not in valid_arg_keys:
+                    nonval_keys.append(arg_key)
+            _maybe_curation_keys_not_present(nonval_keys, name)
+
+        # register obs columns' names
+        register_columns = list(self._obs_fields.keys())
+        organism = check_registry_organism(
+            self._columns_field.field.model, self._organism
+        ).get("organism")
+        update_registry(
+            values=register_columns,
+            field=self._columns_field,
+            key="columns",
+            using_key=self._using_key,
+            validated_only=False,
+            organism=organism,
+            source=self._sources.get("columns"),
+            exclude=self._exclude.get("columns"),
+        )
+        additional_columns = [k for k in valid_obs_keys if k not in register_columns]
+        # no need to register with validated_only=True if columns are features
+        if (
+            len(additional_columns) > 0
+            and self._columns_field.field.model is not Feature
+        ):
+            update_registry(
+                values=additional_columns,
+                field=self._columns_field,
+                key="columns",
+                using_key=self._using_key,
+                validated_only=True,
+                organism=organism,
+                source=self._sources.get("columns"),
+                exclude=self._exclude.get("columns"),
+            )
+
+    def validate(self):
+        """Validate categories."""
+        from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
+
+        validated = True
+        self._non_validated_values = {}
+        with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
+            for ms, (key, field) in self._var_fields.items():
+                var_ms = experiment.ms[ms].var
+                var_ms_key = f"{ms}__{key}"
+                # it was already validated and cached
+                if var_ms_key in self._validated_values:
+                    continue
+                var_ms_values = (
+                    var_ms.read(column_names=[key]).concat()[key].to_pylist()
+                )
+                organism = check_registry_organism(
+                    field.field.model, self._organism
+                ).get("organism")
+                update_registry(
+                    values=var_ms_values,
+                    field=field,
+                    key=var_ms_key,
+                    using_key=self._using_key,
+                    validated_only=True,
+                    organism=organism,
+                    source=self._sources.get(var_ms_key),
+                    exclude=self._exclude.get(var_ms_key),
+                )
+                _, non_val = validate_categories(
+                    values=var_ms_values,
+                    field=field,
+                    key=var_ms_key,
+                    using_key=self._using_key,
+                    organism=organism,
+                    source=self._sources.get(var_ms_key),
+                    exclude=self._exclude.get(var_ms_key),
+                )
+                if len(non_val) > 0:
+                    validated = False
+                    self._non_validated_values[var_ms_key] = non_val
+                else:
+                    self._validated_values[var_ms_key] = var_ms_values
+
+            obs = experiment.obs
+            for key, field in self._obs_fields.items():
+                # already validated and cached
+                if key in self._validated_values:
+                    continue
+                values = pa.compute.unique(
+                    obs.read(column_names=[key]).concat()[key]
+                ).to_pylist()
+                organism = check_registry_organism(
+                    field.field.model, self._organism
+                ).get("organism")
+                update_registry(
+                    values=values,
+                    field=field,
+                    key=key,
+                    using_key=self._using_key,
+                    validated_only=True,
+                    organism=organism,
+                    source=self._sources.get(key),
+                    exclude=self._exclude.get(key),
+                )
+                _, non_val = validate_categories(
+                    values=values,
+                    field=field,
+                    key=key,
+                    using_key=self._using_key,
+                    organism=organism,
+                    source=self._sources.get(key),
+                    exclude=self._exclude.get(key),
+                )
+                if len(non_val) > 0:
+                    validated = False
+                    self._non_validated_values[key] = non_val
+                else:
+                    self._validated_values[key] = values
+        self._validated = validated
+        return self._validated
+
+    def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
+        assert self._non_validated_values is not None  # noqa: S101
+
+        if key in self._valid_obs_keys:
+            field = self._obs_fields[key]
+        elif key in self._valid_var_keys:
+            ms = key.partition("__")[0]
+            field = self._var_fields[ms][1]
+        else:
+            raise KeyError(f"key {key} is invalid!")
+        values = self._non_validated_values.get(key, [])
+        return values, field
+
+    def add_new_from(self, key: str) -> None:
+        """Add validated & new categories.
+
+        Args:
+            key: The key referencing the slot in the `tiledbsoma` store.
+                It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
+                or a column name in `.obs`.
+        """
+        if self._non_validated_values is None:
+            raise ValidationError("Run .validate() first.")
+        if key == "all":
+            keys = list(self._non_validated_values.keys())
+        else:
+            avail_keys = list(
+                chain(self._non_validated_values.keys(), self._validated_values.keys())
+            )
+            if key not in avail_keys:
+                raise KeyError(
+                    f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
+                )
+            keys = [key]
+        for k in keys:
+            values, field = self._non_validated_values_field(k)
+            if len(values) == 0:
+                continue
+            organism = check_registry_organism(field.field.model, self._organism).get(
+                "organism"
+            )
+            update_registry(
+                values=values,
+                field=field,
+                key=k,
+                using_key=self._using_key,
+                validated_only=False,
+                organism=organism,
+                source=self._sources.get(k),
+                exclude=self._exclude.get(k),
+            )
+            # update non-validated values list but keep the key there
+            # it will be removed by .validate()
+            if k in self._non_validated_values:
+                self._non_validated_values[k] = []
+
+    @property
+    def non_validated(self) -> dict[str, list]:
+        """Return the non-validated features and labels."""
+        non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
+        return non_val
+
+    @property
+    def var_index(self) -> dict[str, FieldAttr]:
+        """Return the registry fields with flattened keys to validate variables indices against."""
+        return self._var_fields_flat
+
+    @property
+    def categoricals(self) -> dict[str, FieldAttr]:
+        """Return the obs fields to validate against."""
+        return self._obs_fields
+
+    def lookup(
+        self, using_key: str | None = None, public: bool = False
+    ) -> CurateLookup:
+        """Lookup categories.
+
+        Args:
+            using_key: The instance where the lookup is performed.
+                if "public", the lookup is performed on the public reference.
+        """
+        return CurateLookup(
+            categoricals=self._obs_fields,
+            slots={"columns": self._columns_field, **self._var_fields_flat},
+            using_key=using_key or self._using_key,
+            public=public,
+        )
+
+    def standardize(self, key: str):
+        """Replace synonyms with standardized values.
+
+        Modifies the dataset inplace.
+
+        Args:
+            key: The key referencing the slot in the `tiledbsoma` store.
+                It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
+                or a column name in `.obs`.
+        """
+        if len(self.non_validated) == 0:
+            logger.warning("values are already standardized")
+            return
+        avail_keys = list(self._non_validated_values.keys())
+        if key == "all":
+            keys = avail_keys
+        else:
+            if key not in avail_keys:
+                raise KeyError(
+                    f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
+                )
+            keys = [key]
+
+        for k in keys:
+            values, field = self._non_validated_values_field(k)
+            if len(values) == 0:
+                continue
+            if k in self._valid_var_keys:
+                ms, _, slot_key = k.partition("__")
+                slot = lambda experiment: experiment.ms[ms].var  # noqa: B023
+            else:
+                slot = lambda experiment: experiment.obs
+                slot_key = k
+            # errors if public ontology and the model has no organism
+            # has to be fixed in bionty
+            organism = check_registry_organism(field.field.model, self._organism).get(
+                "organism"
+            )
+            syn_mapper = standardize_categories(
+                values=values,
+                field=field,
+                using_key=self._using_key,
+                source=self._sources.get(k),
+                organism=organism,
+            )
+            if (n_syn_mapper := len(syn_mapper)) == 0:
+                continue
+
+            from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
+
+            with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
+                value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
+                table = slot(experiment).read(value_filter=value_filter).concat()
+
+            if len(table) == 0:
+                continue
+
+            df = table.to_pandas()
+            # map values
+            df[slot_key] = df[slot_key].map(
+                lambda val: syn_mapper.get(val, val)  # noqa
+            )
+            # write the mapped values
+            with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
+                slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
+            # update non_validated dict
+            non_val_k = [
+                nv for nv in self._non_validated_values[k] if nv not in syn_mapper
+            ]
+            self._non_validated_values[k] = non_val_k
+
+            syn_mapper_print = _format_values(
+                [f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
+            )
+            s = "s" if n_syn_mapper > 1 else ""
+            logger.success(
+                f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
+            )
+
+    def save_artifact(
+        self,
+        description: str | None = None,
+        key: str | None = None,
+        revises: Artifact | None = None,
+        run: Run | None = None,
+    ) -> Artifact:
+        """Save the validated `tiledbsoma` store and metadata.
+
+        Args:
+            description: A description of the ``tiledbsoma`` store.
+            key: A path-like key to reference artifact in default storage,
+                e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
+            revises: Previous version of the artifact. Triggers a revision.
+            run: The run that creates the artifact.
+
+        Returns:
+            A saved artifact record.
+        """
+        from lamindb.core._data import add_labels
+
+        if not self._validated:
+            self.validate()
+            if not self._validated:
+                raise ValidationError("Dataset does not validate. Please curate.")
+
+        if self._artifact is None:
+            artifact = Artifact(
+                self._experiment_uri,
+                description=description,
+                key=key,
+                revises=revises,
+                run=run,
+            )
+            artifact.n_observations = self._n_obs
+            artifact._accessor = "tiledbsoma"
+            artifact.save()
+        else:
+            artifact = self._artifact
+
+        feature_sets = {}
+        if len(self._obs_fields) > 0:
+            organism = check_registry_organism(
+                self._columns_field.field.model, self._organism
+            ).get("organism")
+            feature_sets["obs"] = FeatureSet.from_values(
+                values=list(self._obs_fields.keys()),
+                field=self._columns_field,
+                organism=organism,
+                raise_validation_error=False,
+            )
+        for ms in self._var_fields:
+            var_key, var_field = self._var_fields[ms]
+            organism = check_registry_organism(
+                var_field.field.model, self._organism
+            ).get("organism")
+            feature_sets[f"{ms}__var"] = FeatureSet.from_values(
+                values=self._validated_values[f"{ms}__{var_key}"],
+                field=var_field,
+                organism=organism,
+                raise_validation_error=False,
+            )
+        artifact._feature_sets = feature_sets
+
+        feature_ref_is_name = _ref_is_name(self._columns_field)
+        features = Feature.lookup().dict()
+        for key, field in self._obs_fields.items():
+            feature = features.get(key)
+            registry = field.field.model
+            organism = check_registry_organism(field.field.model, self._organism).get(
+                "organism"
+            )
+            labels = registry.from_values(
+                values=self._validated_values[key], field=field, organism=organism
+            )
+            if len(labels) == 0:
+                continue
+            if hasattr(registry, "_name_field"):
+                label_ref_is_name = field.field.name == registry._name_field
+            add_labels(
+                artifact,
+                records=labels,
+                feature=feature,
+                feature_ref_is_name=feature_ref_is_name,
+                label_ref_is_name=label_ref_is_name,
+                from_curator=True,
+            )
+
+        return artifact.save()
+
+
 class Curator(BaseCurator):
     """Dataset curator.
 
@@ -1072,7 +1595,7 @@ class Curator(BaseCurator):
    >>>     categoricals={"perturbation": ln.ULabel.name},  # map categories
    >>> )
    >>> curator.validate()  # validate the data in df
-    >>> artifact =
+    >>> artifact = curator.save_artifact(description="my RNA-seq")
    >>> artifact.describe()  # see annotations
 
    `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
@@ -1150,6 +1673,31 @@ class Curator(BaseCurator):
             organism=organism,
         )
 
+    @classmethod
+    @doc_args(SOMACurator.__doc__)
+    def from_tiledbsoma(
+        cls,
+        experiment_uri: UPathStr,
+        var_index: dict[str, tuple[str, FieldAttr]],
+        categoricals: dict[str, FieldAttr] | None = None,
+        obs_columns: FieldAttr = Feature.name,
+        using_key: str | None = None,
+        organism: str | None = None,
+        sources: dict[str, Record] | None = None,
+        exclude: dict[str, str | list[str]] | None = None,
+    ) -> SOMACurator:
+        """{}"""  # noqa: D415
+        return SOMACurator(
+            experiment_uri=experiment_uri,
+            var_index=var_index,
+            categoricals=categoricals,
+            obs_columns=obs_columns,
+            using_key=using_key,
+            organism=organism,
+            sources=sources,
+            exclude=exclude,
+        )
+
 
 def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
     """Get a registry instance using a specific instance."""
@@ -1253,7 +1801,7 @@ def validate_categories(
         standardize: Whether to standardize the values.
         hint_print: The hint to print that suggests fixing non-validated values.
     """
-    from lamindb._from_values import
+    from lamindb._from_values import _format_values
     from lamindb.core._settings import settings
 
     model_field = f"{field.field.model.__name__}.{field.field.name}"
@@ -1315,22 +1863,17 @@ def validate_categories(
     non_validated = [i for i in non_validated if i not in values_validated]
     n_non_validated = len(non_validated)
     if n_non_validated == 0:
-
-
-
-            logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
-            return True, []
-        else:
-            # validated values still need to be saved to the current instance
-            return False, []
+        logger.indent = ""
+        logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
+        return True, []
     else:
         are = "is" if n_non_validated == 1 else "are"
         s = "" if n_non_validated == 1 else "s"
-        print_values =
+        print_values = _format_values(non_validated)
         warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
         if syn_mapper:
             s = "" if len(syn_mapper) == 1 else "s"
-            syn_mapper_print =
+            syn_mapper_print = _format_values(
                 [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
             )
             hint_msg = f'.standardize("{key}")'
@@ -1522,16 +2065,17 @@ def save_artifact(
             )
             if len(labels) == 0:
                 continue
+            label_ref_is_name = None
             if hasattr(registry, "_name_field"):
                 label_ref_is_name = field.field.name == registry._name_field
-
-
-
-
-
-
-
-
+            add_labels(
+                artifact,
+                records=labels,
+                feature=feature,
+                feature_ref_is_name=feature_ref_is_name,
+                label_ref_is_name=label_ref_is_name,
+                from_curator=True,
+            )
 
     if artifact._accessor == "MuData":
         for modality, modality_fields in fields.items():
@@ -1710,7 +2254,7 @@ def log_saved_labels(
     validated_only: bool = True,
 ) -> None:
     """Log the saved labels."""
-    from ._from_values import
+    from ._from_values import _format_values
 
     model_field = colors.italic(model_field)
     for k, labels in labels_saved.items():
@@ -1724,7 +2268,7 @@ def log_saved_labels(
         # labels from a public ontology or a different instance to the present instance
         s = "s" if len(labels) > 1 else ""
         logger.success(
-            f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {
+            f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_format_values(labels)}'
         )
 
 
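Several docstrings in this diff also expand the `exclude` parameter: values listed there are skipped during validation, which matters when a pinned bionty Source lacks default terms such as "unknown" or "na". A minimal sketch, assuming a pandas DataFrame `df` with a placeholder `cell_type` column:

import bionty as bt
import lamindb as ln

# Sketch only: `df` and its "cell_type" column are placeholders.
curator = ln.Curator.from_df(
    df,
    categoricals={"cell_type": bt.CellType.name},
    exclude={"cell_type": ["unknown", "na"]},  # these values are not validated against CellType
    organism="human",
)
curator.validate()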