lamindb 0.77.2__py3-none-any.whl → 0.77.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_curate.py CHANGED
@@ -2,22 +2,26 @@ from __future__ import annotations
2
2
 
3
3
  import copy
4
4
  import warnings
5
+ from itertools import chain
5
6
  from typing import TYPE_CHECKING
6
7
 
7
8
  import anndata as ad
8
9
  import lamindb_setup as ln_setup
9
10
  import pandas as pd
11
+ import pyarrow as pa
10
12
  from lamin_utils import colors, logger
11
13
  from lamindb_setup.core._docs import doc_args
14
+ from lamindb_setup.core.upath import UPath
12
15
  from lnschema_core import (
13
16
  Artifact,
14
17
  Feature,
18
+ FeatureSet,
15
19
  Record,
16
20
  Run,
17
21
  ULabel,
18
22
  )
19
23
 
20
- from ._from_values import _print_values
24
+ from ._from_values import _format_values
21
25
  from .core.exceptions import ValidationError
22
26
 
23
27
  if TYPE_CHECKING:
@@ -40,8 +44,8 @@ class CurateLookup:
40
44
  public: Whether to lookup from the public instance. Defaults to False.
41
45
 
42
46
  Example:
43
- >>> validator = ln.Validator()
44
- >>> validator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
47
+ >>> curator = ln.Curator.from_df(...)
48
+ >>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
45
49
  <Category: alveolar_type_1_fibroblast_cell>
46
50
 
47
51
  """
@@ -96,7 +100,7 @@ class CurateLookup:
96
100
  f"Lookup objects from the {colors.italic(ref)}:\n "
97
101
  f"{colors.green(getattr_keys)}\n "
98
102
  f"{colors.green(getitem_keys)}\n"
99
- 'Example:\n → categories = validator.lookup()["cell_type"]\n'
103
+ 'Example:\n → categories = curator.lookup()["cell_type"]\n'
100
104
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
101
105
  "To look up public ontologies, use .lookup(public=True)"
102
106
  )
@@ -107,6 +111,15 @@ class CurateLookup:
107
111
  class BaseCurator:
108
112
  """Curate a dataset."""
109
113
 
114
+ def __init_subclass__(cls, **kwargs):
115
+ super().__init_subclass__(**kwargs)
116
+ import sys
117
+
118
+ # Deprecated methods
119
+ if "sphinx" not in sys.modules:
120
+ if hasattr(cls, "_add_new_from_columns"):
121
+ cls.add_new_from_columns = cls._add_new_from_columns
122
+
110
123
  def validate(self) -> bool:
111
124
  """Validate dataset.
112
125
 
@@ -164,14 +177,16 @@ class DataFrameCurator(BaseCurator):
164
177
  verbosity: The verbosity level.
165
178
  organism: The organism name.
166
179
  sources: A dictionary mapping column names to Source records.
167
- exclude: A dictionary mapping column names to values to exclude.
180
+ exclude: A dictionary mapping column names to values to exclude from validation.
181
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
182
+ using the exclude parameter ensures they are not validated.
168
183
 
169
184
  Returns:
170
185
  A curator object.
171
186
 
172
187
  Examples:
173
188
  >>> import bionty as bt
174
- >>> curate = ln.Curator.from_df(
189
+ >>> curator = ln.Curator.from_df(
175
190
  ... df,
176
191
  ... categoricals={
177
192
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
@@ -255,7 +270,7 @@ class DataFrameCurator(BaseCurator):
255
270
  are = "are" if n > 1 else "is"
256
271
  if len(nonval_keys) > 0:
257
272
  raise ValidationError(
258
- f"the following {n} key{s} passed to {name} {are} not allowed: {colors.yellow(_print_values(nonval_keys))}"
273
+ f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
259
274
  )
260
275
 
261
276
  def _save_columns(self, validated_only: bool = True) -> None:
@@ -300,7 +315,7 @@ class DataFrameCurator(BaseCurator):
300
315
  self._kwargs.update({"organism": organism} if organism else {})
301
316
  self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
302
317
 
303
- def add_new_from_columns(self, organism: str | None = None, **kwargs):
318
+ def _add_new_from_columns(self, organism: str | None = None, **kwargs):
304
319
  """Deprecated to run by default during init."""
305
320
  warnings.warn(
306
321
  "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
@@ -323,7 +338,7 @@ class DataFrameCurator(BaseCurator):
323
338
  # logging
324
339
  n = len(syn_mapper)
325
340
  if n > 0:
326
- syn_mapper_print = _print_values(
341
+ syn_mapper_print = _format_values(
327
342
  [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
328
343
  )
329
344
  s = "s" if n > 1 else ""
@@ -332,13 +347,13 @@ class DataFrameCurator(BaseCurator):
332
347
  )
333
348
  return std_values
334
349
 
335
- def standardize(self, key: str):
350
+ def standardize(self, key: str) -> None:
336
351
  """Replace synonyms with standardized values.
337
352
 
338
- Args:
339
- key: The key referencing the slot in the DataFrame from which to draw terms.
340
-
341
353
  Modifies the input dataset inplace.
354
+
355
+ Args:
356
+ key: The key referencing the column in the DataFrame to standardize.
342
357
  """
343
358
  # list is needed to avoid RuntimeError: dictionary changed size during iteration
344
359
  avail_keys = list(self.non_validated.keys())
@@ -359,9 +374,12 @@ class DataFrameCurator(BaseCurator):
359
374
  self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
360
375
  else:
361
376
  if key not in avail_keys:
362
- raise KeyError(
363
- f'"{key}" is not a valid key, available keys are: {_print_values(avail_keys)}!'
364
- )
377
+ if key in self._fields:
378
+ logger.info(f"No unstandardized values found for {key!r}")
379
+ else:
380
+ raise KeyError(
381
+ f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
382
+ )
365
383
  else:
366
384
  if key in self._fields: # needed to exclude var_index
367
385
  syn_mapper = standardize_categories(
@@ -375,7 +393,9 @@ class DataFrameCurator(BaseCurator):
375
393
  key, syn_mapper, self._df[key]
376
394
  )
377
395
 
378
- def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
396
+ def _update_registry(
397
+ self, categorical: str, validated_only: bool = True, **kwargs
398
+ ) -> None:
379
399
  if categorical == "all":
380
400
  self._update_registry_all(validated_only=validated_only, **kwargs)
381
401
  else:
@@ -441,7 +461,8 @@ class DataFrameCurator(BaseCurator):
441
461
 
442
462
  Args:
443
463
  description: Description of the DataFrame object.
444
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
464
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
465
+ Artifacts with the same key form a revision family.
445
466
  revises: Previous version of the artifact. Triggers a revision.
446
467
  run: The run that creates the artifact.
447
468
 
@@ -502,11 +523,13 @@ class AnnDataCurator(DataFrameCurator):
502
523
  verbosity: The verbosity level.
503
524
  organism: The organism name.
504
525
  sources: A dictionary mapping ``.obs.columns`` to Source records.
505
- exclude: A dictionary mapping column names to values to exclude.
526
+ exclude: A dictionary mapping column names to values to exclude from validation.
527
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
528
+ using the exclude parameter ensures they are not validated.
506
529
 
507
530
  Examples:
508
531
  >>> import bionty as bt
509
- >>> curate = ln.Curator.from_anndata(
532
+ >>> curator = ln.Curator.from_anndata(
510
533
  ... adata,
511
534
  ... var_index=bt.Gene.ensembl_gene_id,
512
535
  ... categoricals={
@@ -710,7 +733,8 @@ class AnnDataCurator(DataFrameCurator):
710
733
 
711
734
  Args:
712
735
  description: A description of the ``AnnData`` object.
713
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
736
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
737
+ Artifacts with the same key form a revision family.
714
738
  revises: Previous version of the artifact. Triggers a revision.
715
739
  run: The run that creates the artifact.
716
740
 
@@ -761,11 +785,13 @@ class MuDataCurator:
761
785
  verbosity: The verbosity level.
762
786
  organism: The organism name.
763
787
  sources: A dictionary mapping ``.obs.columns`` to Source records.
764
- exclude: A dictionary mapping column names to values to exclude.
788
+ exclude: A dictionary mapping column names to values to exclude from validation.
789
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
790
+ using the exclude parameter ensures they are not validated.
765
791
 
766
792
  Examples:
767
793
  >>> import bionty as bt
768
- >>> curate = ln.Curator.from_mudata(
794
+ >>> curator = ln.Curator.from_mudata(
769
795
  ... mdata,
770
796
  ... var_index={
771
797
  ... "rna": bt.Gene.ensembl_gene_id,
@@ -1058,6 +1084,503 @@ class MuDataCurator:
1058
1084
  return self._artifact
1059
1085
 
1060
1086
 
1087
+ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1088
+ if (n := len(nonval_keys)) > 0:
1089
+ s = "s" if n > 1 else ""
1090
+ are = "are" if n > 1 else "is"
1091
+ raise ValidationError(
1092
+ f"key{s} passed to {name} {are} not present: {colors.yellow(_format_values(nonval_keys))}"
1093
+ )
1094
+
1095
+
1096
+ class SOMACurator(BaseCurator):
1097
+ """Curation flow for ``tiledbsoma``.
1098
+
1099
+ See also :class:`~lamindb.Curator`.
1100
+
1101
+ Args:
1102
+ experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
1103
+ var_index: The registry fields for mapping the `.var` indices for measurements.
1104
+ Should be in the form `{"measurement name": ("var column", field)}`.
1105
+ These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
1106
+ in `.standardize` or `.add_new_from`, see the output of `.var_index`.
1107
+ categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
1108
+ obs_columns: The registry field for mapping the names of the `.obs` columns.
1109
+ organism: The organism name.
1110
+ sources: A dictionary mapping `.obs` columns to Source records.
1111
+ exclude: A dictionary mapping column names to values to exclude from validation.
1112
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1113
+ using the exclude parameter ensures they are not validated.
1114
+
1115
+ Examples:
1116
+ >>> import bionty as bt
1117
+ >>> curator = ln.Curator.from_tiledbsoma(
1118
+ ... "./my_array_store.tiledbsoma",
1119
+ ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1120
+ ... categoricals={
1121
+ ... "cell_type_ontology_id": bt.CellType.ontology_id,
1122
+ ... "donor_id": ln.ULabel.name
1123
+ ... },
1124
+ ... organism="human",
1125
+ ... )
1126
+ """
1127
+
1128
def __init__(
    self,
    experiment_uri: UPathStr | Artifact,
    var_index: dict[str, tuple[str, FieldAttr]],
    categoricals: dict[str, FieldAttr] | None = None,
    obs_columns: FieldAttr = Feature.name,
    organism: str | None = None,
    sources: dict[str, Record] | None = None,
    exclude: dict[str, str | list[str]] | None = None,
    using_key: str | None = None,
):
    """Store the curation settings and check the passed keys against the store.

    Args:
        experiment_uri: Path to the store, or an ``Artifact`` whose ``.path`` is used.
        var_index: Mapping ``{measurement name: (var column, field)}``.
        categoricals: Mapping of ``.obs`` columns to registry fields.
        obs_columns: Registry field for the names of the ``.obs`` columns.
        organism: The organism name.
        sources: Mapping of keys to Source records.
        exclude: Mapping of keys to values to exclude from validation.
        using_key: Passed on as ``using_key`` to the registry helpers.
    """
    # a passed artifact is kept so that `save_artifact` reuses it
    is_artifact = isinstance(experiment_uri, Artifact)
    self._artifact = experiment_uri if is_artifact else None
    self._experiment_uri = (
        experiment_uri.path if is_artifact else UPath(experiment_uri)
    )

    # registry fields to validate against
    self._obs_fields = categoricals if categoricals else {}
    self._var_fields = var_index
    self._columns_field = obs_columns

    # validation options
    self._organism = organism
    self._using_key = using_key
    self._sources = sources if sources else {}
    self._exclude = exclude if exclude else {}

    # validation state
    self._validated: bool | None = False
    self._non_validated_values: dict[str, list] | None = None
    self._validated_values: dict[str, list] = {}

    # filled by _check_save_keys
    self._n_obs: int | None = None
    self._valid_obs_keys: list[str] | None = None
    self._valid_var_keys: list[str] | None = None
    self._var_fields_flat: dict[str, FieldAttr] | None = None
    self._check_save_keys()
1162
+
1163
+ # check that the provided keys in var_index and categoricals are available in the store
1164
+ # and save features
1165
def _check_save_keys(self):
    """Check the user-provided keys against the store and register column names.

    Reads the ``.obs`` keys and the per-measurement ``.var`` keys from the store
    (ignoring ``soma_joinid``), checks that every key in ``categoricals``,
    ``var_index``, ``sources`` and ``exclude`` refers to one of them, and then
    registers the ``.obs`` column names via ``update_registry``.

    Raises:
        ValidationError: Via ``_maybe_curation_keys_not_present`` if a passed
            key is not present in the store.
    """
    from lamindb.core.storage._tiledbsoma import _open_tiledbsoma

    with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
        experiment_obs = experiment.obs
        self._n_obs = len(experiment_obs)
        valid_obs_keys = [k for k in experiment_obs.keys() if k != "soma_joinid"]
        self._valid_obs_keys = valid_obs_keys

        # var keys are flattened to '{measurement name}__{column name in .var}'
        valid_var_keys = []
        for ms in experiment.ms.keys():
            var_ms = experiment.ms[ms].var
            valid_var_keys.extend(
                f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
            )
        self._valid_var_keys = valid_var_keys

    # check validity of keys in categoricals
    nonval_keys = [k for k in self._obs_fields if k not in valid_obs_keys]
    _maybe_curation_keys_not_present(nonval_keys, "categoricals")

    # check validity of keys in var_index
    self._var_fields_flat = {}
    nonval_keys = []
    for ms_key, (var_key, var_field) in self._var_fields.items():
        var_key_flat = f"{ms_key}__{var_key}"
        if var_key_flat in valid_var_keys:
            self._var_fields_flat[var_key_flat] = var_field
        else:
            nonval_keys.append(f"({ms_key}, {var_key})")
    _maybe_curation_keys_not_present(nonval_keys, "var_index")

    # check validity of keys in sources and exclude
    valid_arg_keys = {*valid_obs_keys, *valid_var_keys, "columns"}
    for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
        nonval_keys = [k for k in dct if k not in valid_arg_keys]
        _maybe_curation_keys_not_present(nonval_keys, name)

    # register obs columns' names
    register_columns = list(self._obs_fields.keys())
    organism = check_registry_organism(
        self._columns_field.field.model, self._organism
    ).get("organism")
    # keyword arguments shared by both registration calls below
    columns_kwargs = {
        "field": self._columns_field,
        "key": "columns",
        "using_key": self._using_key,
        "organism": organism,
        "source": self._sources.get("columns"),
        "exclude": self._exclude.get("columns"),
    }
    update_registry(values=register_columns, validated_only=False, **columns_kwargs)

    additional_columns = [k for k in valid_obs_keys if k not in register_columns]
    # no need to register with validated_only=True if columns are features
    if additional_columns and self._columns_field.field.model is not Feature:
        update_registry(
            values=additional_columns, validated_only=True, **columns_kwargs
        )
1243
+
1244
def validate(self):
    """Validate categories.

    Checks the configured ``.var`` columns of every measurement and the
    configured ``.obs`` columns. Keys that validated in an earlier call are
    cached in ``self._validated_values`` and skipped.

    Returns:
        ``True`` if no non-validated values were found in this run, else ``False``.
    """
    from lamindb.core.storage._tiledbsoma import _open_tiledbsoma

    def _validate_slot(slot_key, slot_values, slot_field) -> bool:
        # register validated values, then collect what does not validate
        organism = check_registry_organism(
            slot_field.field.model, self._organism
        ).get("organism")
        common = {
            "field": slot_field,
            "key": slot_key,
            "using_key": self._using_key,
            "organism": organism,
            "source": self._sources.get(slot_key),
            "exclude": self._exclude.get(slot_key),
        }
        update_registry(values=slot_values, validated_only=True, **common)
        _, non_val = validate_categories(values=slot_values, **common)
        if len(non_val) > 0:
            self._non_validated_values[slot_key] = non_val
            return False
        self._validated_values[slot_key] = slot_values
        return True

    validated = True
    self._non_validated_values = {}
    with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
        for ms, (key, field) in self._var_fields.items():
            var_ms = experiment.ms[ms].var
            var_ms_key = f"{ms}__{key}"
            # it was already validated and cached
            if var_ms_key in self._validated_values:
                continue
            var_ms_values = (
                var_ms.read(column_names=[key]).concat()[key].to_pylist()
            )
            if not _validate_slot(var_ms_key, var_ms_values, field):
                validated = False

        obs = experiment.obs
        for key, field in self._obs_fields.items():
            # already validated and cached
            if key in self._validated_values:
                continue
            obs_column = obs.read(column_names=[key]).concat()[key]
            values = pa.compute.unique(obs_column).to_pylist()
            if not _validate_slot(key, values, field):
                validated = False

    self._validated = validated
    return self._validated
1325
+
1326
+ def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1327
+ assert self._non_validated_values is not None # noqa: S101
1328
+
1329
+ if key in self._valid_obs_keys:
1330
+ field = self._obs_fields[key]
1331
+ elif key in self._valid_var_keys:
1332
+ ms = key.partition("__")[0]
1333
+ field = self._var_fields[ms][1]
1334
+ else:
1335
+ raise KeyError(f"key {key} is invalid!")
1336
+ values = self._non_validated_values.get(key, [])
1337
+ return values, field
1338
+
1339
+ def add_new_from(self, key: str) -> None:
1340
+ """Add validated & new categories.
1341
+
1342
+ Args:
1343
+ key: The key referencing the slot in the `tiledbsoma` store.
1344
+ It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
1345
+ or a column name in `.obs`.
1346
+ """
1347
+ if self._non_validated_values is None:
1348
+ raise ValidationError("Run .validate() first.")
1349
+ if key == "all":
1350
+ keys = list(self._non_validated_values.keys())
1351
+ else:
1352
+ avail_keys = list(
1353
+ chain(self._non_validated_values.keys(), self._validated_values.keys())
1354
+ )
1355
+ if key not in avail_keys:
1356
+ raise KeyError(
1357
+ f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
1358
+ )
1359
+ keys = [key]
1360
+ for k in keys:
1361
+ values, field = self._non_validated_values_field(k)
1362
+ if len(values) == 0:
1363
+ continue
1364
+ organism = check_registry_organism(field.field.model, self._organism).get(
1365
+ "organism"
1366
+ )
1367
+ update_registry(
1368
+ values=values,
1369
+ field=field,
1370
+ key=k,
1371
+ using_key=self._using_key,
1372
+ validated_only=False,
1373
+ organism=organism,
1374
+ source=self._sources.get(k),
1375
+ exclude=self._exclude.get(k),
1376
+ )
1377
+ # update non-validated values list but keep the key there
1378
+ # it will be removed by .validate()
1379
+ if k in self._non_validated_values:
1380
+ self._non_validated_values[k] = []
1381
+
1382
+ @property
1383
+ def non_validated(self) -> dict[str, list]:
1384
+ """Return the non-validated features and labels."""
1385
+ non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
1386
+ return non_val
1387
+
1388
+ @property
1389
+ def var_index(self) -> dict[str, FieldAttr]:
1390
+ """Return the registry fields with flattened keys to validate variables indices against."""
1391
+ return self._var_fields_flat
1392
+
1393
+ @property
1394
+ def categoricals(self) -> dict[str, FieldAttr]:
1395
+ """Return the obs fields to validate against."""
1396
+ return self._obs_fields
1397
+
1398
def lookup(
    self, using_key: str | None = None, public: bool = False
) -> CurateLookup:
    """Lookup categories.

    Args:
        using_key: The instance where the lookup is performed.
            Falls back to the `using_key` the curator was created with.
        public: Passed on to `CurateLookup`.

    Returns:
        A `CurateLookup` over the obs fields, the obs column field
        (slot `"columns"`) and the flattened var index fields.
    """
    # the var index fields are added after "columns" and override it on a clash
    slots = {"columns": self._columns_field}
    slots.update(self._var_fields_flat)
    instance_key = using_key or self._using_key
    return CurateLookup(
        categoricals=self._obs_fields,
        slots=slots,
        using_key=instance_key,
        public=public,
    )
1413
+
1414
def standardize(self, key: str) -> None:
    """Replace synonyms with standardized values.

    Modifies the dataset inplace.

    Args:
        key: The key referencing the slot in the `tiledbsoma` store.
            It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
            or a column name in `.obs`. Pass `"all"` to process every key seen
            by the last `.validate()` run.

    Raises:
        ValidationError: If `.validate()` has not been run yet.
        KeyError: If `key` is neither `"all"` nor a key seen by `.validate()`.
    """
    from lamindb.core.storage._tiledbsoma import _open_tiledbsoma

    # without a prior .validate() there are no non-validated values to map
    if self._non_validated_values is None:
        raise ValidationError("Run .validate() first.")
    if len(self.non_validated) == 0:
        logger.warning("values are already standardized")
        return
    avail_keys = list(self._non_validated_values.keys())
    if key == "all":
        keys = avail_keys
    else:
        if key not in avail_keys:
            # `!r` already adds quotes, so no literal quotes around the placeholder
            raise KeyError(
                f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
            )
        keys = [key]

    for k in keys:
        values, field = self._non_validated_values_field(k)
        if len(values) == 0:
            continue
        # `slot` picks the dataframe inside an opened experiment,
        # `slot_key` is the column name inside that dataframe
        if k in self._valid_var_keys:
            ms, _, slot_key = k.partition("__")
            slot = lambda experiment: experiment.ms[ms].var  # noqa: B023
        else:
            slot = lambda experiment: experiment.obs
            slot_key = k
        # errors if public ontology and the model has no organism
        # has to be fixed in bionty
        organism = check_registry_organism(field.field.model, self._organism).get(
            "organism"
        )
        syn_mapper = standardize_categories(
            values=values,
            field=field,
            using_key=self._using_key,
            source=self._sources.get(k),
            organism=organism,
        )
        if (n_syn_mapper := len(syn_mapper)) == 0:
            continue

        # read only the rows that hold one of the synonyms
        with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
            value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
            table = slot(experiment).read(value_filter=value_filter).concat()

        if len(table) == 0:
            continue

        df = table.to_pandas()
        # map values
        df[slot_key] = df[slot_key].map(
            lambda val: syn_mapper.get(val, val)  # noqa
        )
        # write the mapped values
        with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
            slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
        # update non_validated dict
        non_val_k = [
            nv for nv in self._non_validated_values[k] if nv not in syn_mapper
        ]
        self._non_validated_values[k] = non_val_k

        syn_mapper_print = _format_values(
            [f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
        )
        s = "s" if n_syn_mapper > 1 else ""
        logger.success(
            f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
        )
1492
+
1493
def save_artifact(
    self,
    description: str | None = None,
    key: str | None = None,
    revises: Artifact | None = None,
    run: Run | None = None,
) -> Artifact:
    """Save the validated `tiledbsoma` store and metadata.

    Args:
        description: A description of the ``tiledbsoma`` store.
        key: A path-like key to reference artifact in default storage,
            e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
        revises: Previous version of the artifact. Triggers a revision.
        run: The run that creates the artifact.

    Returns:
        A saved artifact record.

    Raises:
        ValidationError: If the dataset still does not validate after
            running `.validate()`.
    """
    from lamindb.core._data import add_labels

    if not self._validated:
        self.validate()
        if not self._validated:
            raise ValidationError("Dataset does not validate. Please curate.")

    # reuse the artifact the curator was created from, if there is one
    if self._artifact is None:
        artifact = Artifact(
            self._experiment_uri,
            description=description,
            key=key,
            revises=revises,
            run=run,
        )
        artifact.n_observations = self._n_obs
        artifact._accessor = "tiledbsoma"
        artifact.save()
    else:
        artifact = self._artifact

    # one feature set for the obs columns and one per measurement
    feature_sets = {}
    if len(self._obs_fields) > 0:
        organism = check_registry_organism(
            self._columns_field.field.model, self._organism
        ).get("organism")
        feature_sets["obs"] = FeatureSet.from_values(
            values=list(self._obs_fields.keys()),
            field=self._columns_field,
            organism=organism,
            raise_validation_error=False,
        )
    for ms, (var_key, var_field) in self._var_fields.items():
        organism = check_registry_organism(
            var_field.field.model, self._organism
        ).get("organism")
        feature_sets[f"{ms}__var"] = FeatureSet.from_values(
            values=self._validated_values[f"{ms}__{var_key}"],
            field=var_field,
            organism=organism,
            raise_validation_error=False,
        )
    artifact._feature_sets = feature_sets

    feature_ref_is_name = _ref_is_name(self._columns_field)
    features = Feature.lookup().dict()
    # `obs_key` rather than `key` so the `key` argument is not shadowed
    for obs_key, field in self._obs_fields.items():
        feature = features.get(obs_key)
        registry = field.field.model
        organism = check_registry_organism(registry, self._organism).get("organism")
        labels = registry.from_values(
            values=self._validated_values[obs_key], field=field, organism=organism
        )
        if len(labels) == 0:
            continue
        # bind on every iteration so that a registry without `_name_field`
        # neither reuses the previous value nor hits an unbound local
        label_ref_is_name = None
        if hasattr(registry, "_name_field"):
            label_ref_is_name = field.field.name == registry._name_field
        add_labels(
            artifact,
            records=labels,
            feature=feature,
            feature_ref_is_name=feature_ref_is_name,
            label_ref_is_name=label_ref_is_name,
            from_curator=True,
        )

    return artifact.save()
1582
+
1583
+
1061
1584
  class Curator(BaseCurator):
1062
1585
  """Dataset curator.
1063
1586
 
@@ -1072,7 +1595,7 @@ class Curator(BaseCurator):
1072
1595
  >>> categoricals={"perturbation": ln.ULabel.name}, # map categories
1073
1596
  >>> )
1074
1597
  >>> curator.validate() # validate the data in df
1075
- >>> artifact = curate.save_artifact(description="my RNA-seq")
1598
+ >>> artifact = curator.save_artifact(description="my RNA-seq")
1076
1599
  >>> artifact.describe() # see annotations
1077
1600
 
1078
1601
  `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
@@ -1150,6 +1673,31 @@ class Curator(BaseCurator):
1150
1673
  organism=organism,
1151
1674
  )
1152
1675
 
1676
@classmethod
@doc_args(SOMACurator.__doc__)
def from_tiledbsoma(
    cls,
    # SOMACurator also accepts an Artifact here and reuses it on save
    experiment_uri: UPathStr | Artifact,
    var_index: dict[str, tuple[str, FieldAttr]],
    categoricals: dict[str, FieldAttr] | None = None,
    obs_columns: FieldAttr = Feature.name,
    using_key: str | None = None,
    organism: str | None = None,
    sources: dict[str, Record] | None = None,
    exclude: dict[str, str | list[str]] | None = None,
) -> SOMACurator:
    """{}"""  # noqa: D415
    # the docstring above is a template that `doc_args` fills with
    # `SOMACurator.__doc__`; all arguments are passed through unchanged
    return SOMACurator(
        experiment_uri=experiment_uri,
        var_index=var_index,
        categoricals=categoricals,
        obs_columns=obs_columns,
        using_key=using_key,
        organism=organism,
        sources=sources,
        exclude=exclude,
    )
1700
+
1153
1701
 
1154
1702
  def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
1155
1703
  """Get a registry instance using a specific instance."""
@@ -1253,7 +1801,7 @@ def validate_categories(
1253
1801
  standardize: Whether to standardize the values.
1254
1802
  hint_print: The hint to print that suggests fixing non-validated values.
1255
1803
  """
1256
- from lamindb._from_values import _print_values
1804
+ from lamindb._from_values import _format_values
1257
1805
  from lamindb.core._settings import settings
1258
1806
 
1259
1807
  model_field = f"{field.field.model.__name__}.{field.field.name}"
@@ -1315,22 +1863,17 @@ def validate_categories(
1315
1863
  non_validated = [i for i in non_validated if i not in values_validated]
1316
1864
  n_non_validated = len(non_validated)
1317
1865
  if n_non_validated == 0:
1318
- if len(values_validated) == 0:
1319
- # nothing to validate
1320
- logger.indent = ""
1321
- logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
1322
- return True, []
1323
- else:
1324
- # validated values still need to be saved to the current instance
1325
- return False, []
1866
+ logger.indent = ""
1867
+ logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
1868
+ return True, []
1326
1869
  else:
1327
1870
  are = "is" if n_non_validated == 1 else "are"
1328
1871
  s = "" if n_non_validated == 1 else "s"
1329
- print_values = _print_values(non_validated)
1872
+ print_values = _format_values(non_validated)
1330
1873
  warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
1331
1874
  if syn_mapper:
1332
1875
  s = "" if len(syn_mapper) == 1 else "s"
1333
- syn_mapper_print = _print_values(
1876
+ syn_mapper_print = _format_values(
1334
1877
  [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
1335
1878
  )
1336
1879
  hint_msg = f'.standardize("{key}")'
@@ -1522,16 +2065,17 @@ def save_artifact(
1522
2065
  )
1523
2066
  if len(labels) == 0:
1524
2067
  continue
2068
+ label_ref_is_name = None
1525
2069
  if hasattr(registry, "_name_field"):
1526
2070
  label_ref_is_name = field.field.name == registry._name_field
1527
- add_labels(
1528
- artifact,
1529
- records=labels,
1530
- feature=feature,
1531
- feature_ref_is_name=feature_ref_is_name,
1532
- label_ref_is_name=label_ref_is_name,
1533
- from_curator=True,
1534
- )
2071
+ add_labels(
2072
+ artifact,
2073
+ records=labels,
2074
+ feature=feature,
2075
+ feature_ref_is_name=feature_ref_is_name,
2076
+ label_ref_is_name=label_ref_is_name,
2077
+ from_curator=True,
2078
+ )
1535
2079
 
1536
2080
  if artifact._accessor == "MuData":
1537
2081
  for modality, modality_fields in fields.items():
@@ -1710,7 +2254,7 @@ def log_saved_labels(
1710
2254
  validated_only: bool = True,
1711
2255
  ) -> None:
1712
2256
  """Log the saved labels."""
1713
- from ._from_values import _print_values
2257
+ from ._from_values import _format_values
1714
2258
 
1715
2259
  model_field = colors.italic(model_field)
1716
2260
  for k, labels in labels_saved.items():
@@ -1724,7 +2268,7 @@ def log_saved_labels(
1724
2268
  # labels from a public ontology or a different instance to the present instance
1725
2269
  s = "s" if len(labels) > 1 else ""
1726
2270
  logger.success(
1727
- f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_print_values(labels)}'
2271
+ f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_format_values(labels)}'
1728
2272
  )
1729
2273
 
1730
2274