lamindb 0.77.2__py3-none-any.whl → 1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +39 -32
- lamindb/_artifact.py +95 -64
- lamindb/_can_curate.py +19 -10
- lamindb/_collection.py +51 -49
- lamindb/_feature.py +9 -9
- lamindb/_finish.py +99 -86
- lamindb/_from_values.py +20 -17
- lamindb/_is_versioned.py +2 -1
- lamindb/_parents.py +23 -16
- lamindb/_query_manager.py +3 -3
- lamindb/_query_set.py +85 -18
- lamindb/_record.py +121 -46
- lamindb/_run.py +3 -3
- lamindb/_save.py +14 -8
- lamindb/{_feature_set.py → _schema.py} +34 -31
- lamindb/_storage.py +2 -1
- lamindb/_transform.py +51 -23
- lamindb/_ulabel.py +17 -8
- lamindb/_view.py +15 -14
- lamindb/base/__init__.py +24 -0
- lamindb/base/fields.py +281 -0
- lamindb/base/ids.py +103 -0
- lamindb/base/types.py +51 -0
- lamindb/base/users.py +30 -0
- lamindb/base/validation.py +67 -0
- lamindb/core/__init__.py +19 -14
- lamindb/core/_context.py +297 -228
- lamindb/core/_data.py +44 -49
- lamindb/core/_describe.py +41 -31
- lamindb/core/_django.py +59 -44
- lamindb/core/_feature_manager.py +192 -168
- lamindb/core/_label_manager.py +22 -22
- lamindb/core/_mapped_collection.py +17 -14
- lamindb/core/_settings.py +1 -12
- lamindb/core/_sync_git.py +56 -9
- lamindb/core/_track_environment.py +1 -1
- lamindb/core/datasets/_core.py +5 -6
- lamindb/core/exceptions.py +0 -7
- lamindb/core/fields.py +1 -1
- lamindb/core/loaders.py +18 -2
- lamindb/core/{schema.py → relations.py} +22 -19
- lamindb/core/storage/_anndata_accessor.py +1 -2
- lamindb/core/storage/_backed_access.py +2 -1
- lamindb/core/storage/_tiledbsoma.py +40 -13
- lamindb/core/storage/objects.py +1 -1
- lamindb/core/storage/paths.py +13 -8
- lamindb/core/subsettings/__init__.py +0 -2
- lamindb/core/types.py +2 -23
- lamindb/core/versioning.py +11 -7
- lamindb/{_curate.py → curators/__init__.py} +700 -57
- lamindb/curators/_spatial.py +528 -0
- lamindb/integrations/_vitessce.py +1 -3
- lamindb/migrations/0052_squashed.py +1261 -0
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +57 -0
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +35 -0
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +61 -0
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +22 -0
- lamindb/migrations/0057_link_models_latest_report_and_others.py +356 -0
- lamindb/migrations/0058_artifact__actions_collection__actions.py +22 -0
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +31 -0
- lamindb/migrations/0060_alter_artifact__actions.py +22 -0
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +45 -0
- lamindb/migrations/0062_add_is_latest_field.py +32 -0
- lamindb/migrations/0063_populate_latest_field.py +45 -0
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +33 -0
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +22 -0
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +352 -0
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +20 -0
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +20 -0
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +1294 -0
- lamindb/migrations/0069_squashed.py +1770 -0
- lamindb/migrations/0070_lamindbv1_migrate_data.py +78 -0
- lamindb/migrations/0071_lamindbv1_migrate_schema.py +741 -0
- lamindb/migrations/0072_remove_user__branch_code_remove_user_aux_and_more.py +148 -0
- lamindb/migrations/0073_merge_ourprojects.py +945 -0
- lamindb/migrations/0074_lamindbv1_part4.py +374 -0
- lamindb/migrations/0075_lamindbv1_part5.py +276 -0
- lamindb/migrations/0076_lamindbv1_part6.py +621 -0
- lamindb/migrations/0077_lamindbv1_part6b.py +228 -0
- lamindb/migrations/0078_lamindbv1_part6c.py +468 -0
- lamindb/migrations/0079_alter_rundata_value_json_and_more.py +36 -0
- lamindb/migrations/__init__.py +0 -0
- lamindb/models.py +4064 -0
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/METADATA +15 -20
- lamindb-1.0rc1.dist-info/RECORD +100 -0
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/WHEEL +1 -1
- lamindb/core/subsettings/_transform_settings.py +0 -21
- lamindb-0.77.2.dist-info/RECORD +0 -63
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/LICENSE +0 -0
@@ -2,31 +2,40 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import copy
|
4
4
|
import warnings
|
5
|
+
from itertools import chain
|
5
6
|
from typing import TYPE_CHECKING
|
6
7
|
|
7
8
|
import anndata as ad
|
8
9
|
import lamindb_setup as ln_setup
|
9
10
|
import pandas as pd
|
11
|
+
import pyarrow as pa
|
10
12
|
from lamin_utils import colors, logger
|
11
13
|
from lamindb_setup.core._docs import doc_args
|
12
|
-
from
|
14
|
+
from lamindb_setup.core.upath import UPath
|
15
|
+
|
16
|
+
from lamindb.models import (
|
13
17
|
Artifact,
|
14
18
|
Feature,
|
15
19
|
Record,
|
16
20
|
Run,
|
21
|
+
Schema,
|
17
22
|
ULabel,
|
18
23
|
)
|
19
24
|
|
20
|
-
from
|
21
|
-
from
|
25
|
+
from .._from_values import _format_values
|
26
|
+
from ..core.exceptions import ValidationError
|
22
27
|
|
23
28
|
if TYPE_CHECKING:
|
24
29
|
from collections.abc import Iterable
|
25
30
|
from typing import Any
|
26
31
|
|
27
32
|
from lamindb_setup.core.types import UPathStr
|
28
|
-
from lnschema_core.types import FieldAttr
|
29
33
|
from mudata import MuData
|
34
|
+
from spatialdata import SpatialData
|
35
|
+
|
36
|
+
from lamindb.base.types import FieldAttr
|
37
|
+
|
38
|
+
from ._spatial import SpatialDataCurator
|
30
39
|
|
31
40
|
|
32
41
|
class CurateLookup:
|
@@ -40,8 +49,8 @@ class CurateLookup:
|
|
40
49
|
public: Whether to lookup from the public instance. Defaults to False.
|
41
50
|
|
42
51
|
Example:
|
43
|
-
>>>
|
44
|
-
>>>
|
52
|
+
>>> curator = ln.Curator.from_df(...)
|
53
|
+
>>> curator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
|
45
54
|
<Category: alveolar_type_1_fibroblast_cell>
|
46
55
|
|
47
56
|
"""
|
@@ -96,7 +105,7 @@ class CurateLookup:
|
|
96
105
|
f"Lookup objects from the {colors.italic(ref)}:\n "
|
97
106
|
f"{colors.green(getattr_keys)}\n "
|
98
107
|
f"{colors.green(getitem_keys)}\n"
|
99
|
-
'Example:\n → categories =
|
108
|
+
'Example:\n → categories = curator.lookup()["cell_type"]\n'
|
100
109
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
101
110
|
"To look up public ontologies, use .lookup(public=True)"
|
102
111
|
)
|
@@ -107,6 +116,15 @@ class CurateLookup:
|
|
107
116
|
class BaseCurator:
|
108
117
|
"""Curate a dataset."""
|
109
118
|
|
119
|
+
def __init_subclass__(cls, **kwargs):
|
120
|
+
super().__init_subclass__(**kwargs)
|
121
|
+
import sys
|
122
|
+
|
123
|
+
# Deprecated methods
|
124
|
+
if "sphinx" not in sys.modules:
|
125
|
+
if hasattr(cls, "_add_new_from_columns"):
|
126
|
+
cls.add_new_from_columns = cls._add_new_from_columns
|
127
|
+
|
110
128
|
def validate(self) -> bool:
|
111
129
|
"""Validate dataset.
|
112
130
|
|
@@ -164,14 +182,16 @@ class DataFrameCurator(BaseCurator):
|
|
164
182
|
verbosity: The verbosity level.
|
165
183
|
organism: The organism name.
|
166
184
|
sources: A dictionary mapping column names to Source records.
|
167
|
-
exclude: A dictionary mapping column names to values to exclude.
|
185
|
+
exclude: A dictionary mapping column names to values to exclude from validation.
|
186
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
187
|
+
using the exclude parameter ensures they are not validated.
|
168
188
|
|
169
189
|
Returns:
|
170
190
|
A curator object.
|
171
191
|
|
172
192
|
Examples:
|
173
193
|
>>> import bionty as bt
|
174
|
-
>>>
|
194
|
+
>>> curator = ln.Curator.from_df(
|
175
195
|
... df,
|
176
196
|
... categoricals={
|
177
197
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
@@ -194,6 +214,9 @@ class DataFrameCurator(BaseCurator):
|
|
194
214
|
) -> None:
|
195
215
|
from lamindb.core._settings import settings
|
196
216
|
|
217
|
+
if organism is not None and not isinstance(organism, str):
|
218
|
+
raise ValueError("organism must be a string such as 'human' or 'mouse'!")
|
219
|
+
|
197
220
|
self._df = df
|
198
221
|
self._fields = categoricals or {}
|
199
222
|
self._columns_field = columns
|
@@ -255,7 +278,7 @@ class DataFrameCurator(BaseCurator):
|
|
255
278
|
are = "are" if n > 1 else "is"
|
256
279
|
if len(nonval_keys) > 0:
|
257
280
|
raise ValidationError(
|
258
|
-
f"
|
281
|
+
f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
|
259
282
|
)
|
260
283
|
|
261
284
|
def _save_columns(self, validated_only: bool = True) -> None:
|
@@ -300,7 +323,7 @@ class DataFrameCurator(BaseCurator):
|
|
300
323
|
self._kwargs.update({"organism": organism} if organism else {})
|
301
324
|
self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
|
302
325
|
|
303
|
-
def
|
326
|
+
def _add_new_from_columns(self, organism: str | None = None, **kwargs):
|
304
327
|
"""Deprecated to run by default during init."""
|
305
328
|
warnings.warn(
|
306
329
|
"`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
|
@@ -323,7 +346,7 @@ class DataFrameCurator(BaseCurator):
|
|
323
346
|
# logging
|
324
347
|
n = len(syn_mapper)
|
325
348
|
if n > 0:
|
326
|
-
syn_mapper_print =
|
349
|
+
syn_mapper_print = _format_values(
|
327
350
|
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
328
351
|
)
|
329
352
|
s = "s" if n > 1 else ""
|
@@ -332,13 +355,13 @@ class DataFrameCurator(BaseCurator):
|
|
332
355
|
)
|
333
356
|
return std_values
|
334
357
|
|
335
|
-
def standardize(self, key: str):
|
358
|
+
def standardize(self, key: str) -> None:
|
336
359
|
"""Replace synonyms with standardized values.
|
337
360
|
|
338
|
-
Args:
|
339
|
-
key: The key referencing the slot in the DataFrame from which to draw terms.
|
340
|
-
|
341
361
|
Modifies the input dataset inplace.
|
362
|
+
|
363
|
+
Args:
|
364
|
+
key: The key referencing the column in the DataFrame to standardize.
|
342
365
|
"""
|
343
366
|
# list is needed to avoid RuntimeError: dictionary changed size during iteration
|
344
367
|
avail_keys = list(self.non_validated.keys())
|
@@ -359,9 +382,12 @@ class DataFrameCurator(BaseCurator):
|
|
359
382
|
self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
|
360
383
|
else:
|
361
384
|
if key not in avail_keys:
|
362
|
-
|
363
|
-
f
|
364
|
-
|
385
|
+
if key in self._fields:
|
386
|
+
logger.info(f"No unstandardized values found for {key!r}")
|
387
|
+
else:
|
388
|
+
raise KeyError(
|
389
|
+
f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
|
390
|
+
)
|
365
391
|
else:
|
366
392
|
if key in self._fields: # needed to exclude var_index
|
367
393
|
syn_mapper = standardize_categories(
|
@@ -375,7 +401,9 @@ class DataFrameCurator(BaseCurator):
|
|
375
401
|
key, syn_mapper, self._df[key]
|
376
402
|
)
|
377
403
|
|
378
|
-
def _update_registry(
|
404
|
+
def _update_registry(
|
405
|
+
self, categorical: str, validated_only: bool = True, **kwargs
|
406
|
+
) -> None:
|
379
407
|
if categorical == "all":
|
380
408
|
self._update_registry_all(validated_only=validated_only, **kwargs)
|
381
409
|
else:
|
@@ -441,7 +469,8 @@ class DataFrameCurator(BaseCurator):
|
|
441
469
|
|
442
470
|
Args:
|
443
471
|
description: Description of the DataFrame object.
|
444
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
472
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
473
|
+
Artifacts with the same key form a revision family.
|
445
474
|
revises: Previous version of the artifact. Triggers a revision.
|
446
475
|
run: The run that creates the artifact.
|
447
476
|
|
@@ -502,11 +531,13 @@ class AnnDataCurator(DataFrameCurator):
|
|
502
531
|
verbosity: The verbosity level.
|
503
532
|
organism: The organism name.
|
504
533
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
505
|
-
exclude: A dictionary mapping column names to values to exclude.
|
534
|
+
exclude: A dictionary mapping column names to values to exclude from validation.
|
535
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
536
|
+
using the exclude parameter ensures they are not validated.
|
506
537
|
|
507
538
|
Examples:
|
508
539
|
>>> import bionty as bt
|
509
|
-
>>>
|
540
|
+
>>> curator = ln.Curator.from_anndata(
|
510
541
|
... adata,
|
511
542
|
... var_index=bt.Gene.ensembl_gene_id,
|
512
543
|
... categoricals={
|
@@ -534,7 +565,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
534
565
|
if isinstance(var_index, str):
|
535
566
|
raise TypeError("var_index parameter has to be a bionty field")
|
536
567
|
|
537
|
-
from
|
568
|
+
from .._artifact import data_is_anndata
|
538
569
|
|
539
570
|
if sources is None:
|
540
571
|
sources = {}
|
@@ -710,7 +741,8 @@ class AnnDataCurator(DataFrameCurator):
|
|
710
741
|
|
711
742
|
Args:
|
712
743
|
description: A description of the ``AnnData`` object.
|
713
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
744
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
745
|
+
Artifacts with the same key form a revision family.
|
714
746
|
revises: Previous version of the artifact. Triggers a revision.
|
715
747
|
run: The run that creates the artifact.
|
716
748
|
|
@@ -761,11 +793,13 @@ class MuDataCurator:
|
|
761
793
|
verbosity: The verbosity level.
|
762
794
|
organism: The organism name.
|
763
795
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
764
|
-
exclude: A dictionary mapping column names to values to exclude.
|
796
|
+
exclude: A dictionary mapping column names to values to exclude from validation.
|
797
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
798
|
+
using the exclude parameter ensures they are not validated.
|
765
799
|
|
766
800
|
Examples:
|
767
801
|
>>> import bionty as bt
|
768
|
-
>>>
|
802
|
+
>>> curator = ln.Curator.from_mudata(
|
769
803
|
... mdata,
|
770
804
|
... var_index={
|
771
805
|
... "rna": bt.Gene.ensembl_gene_id,
|
@@ -1058,6 +1092,514 @@ class MuDataCurator:
|
|
1058
1092
|
return self._artifact
|
1059
1093
|
|
1060
1094
|
|
1095
|
+
def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
1096
|
+
if (n := len(nonval_keys)) > 0:
|
1097
|
+
s = "s" if n > 1 else ""
|
1098
|
+
are = "are" if n > 1 else "is"
|
1099
|
+
raise ValidationError(
|
1100
|
+
f"key{s} passed to {name} {are} not present: {colors.yellow(_format_values(nonval_keys))}"
|
1101
|
+
)
|
1102
|
+
|
1103
|
+
|
1104
|
+
class SOMACurator(BaseCurator):
|
1105
|
+
"""Curation flow for ``tiledbsoma``.
|
1106
|
+
|
1107
|
+
See also :class:`~lamindb.Curator`.
|
1108
|
+
|
1109
|
+
Args:
|
1110
|
+
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
1111
|
+
var_index: The registry fields for mapping the `.var` indices for measurements.
|
1112
|
+
Should be in the form `{"measurement name": ("var column", field)}`.
|
1113
|
+
These keys should be used in the flattened form (`'{measurement name}__{column name in .var}'`)
|
1114
|
+
in `.standardize` or `.add_new_from`, see the output of `.var_index`.
|
1115
|
+
categoricals: A dictionary mapping categorical `.obs` columns to a registry field.
|
1116
|
+
obs_columns: The registry field for mapping the names of the `.obs` columns.
|
1117
|
+
organism: The organism name.
|
1118
|
+
sources: A dictionary mapping `.obs` columns to Source records.
|
1119
|
+
exclude: A dictionary mapping column names to values to exclude from validation.
|
1120
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1121
|
+
using the exclude parameter ensures they are not validated.
|
1122
|
+
|
1123
|
+
Examples:
|
1124
|
+
>>> import bionty as bt
|
1125
|
+
>>> curator = ln.Curator.from_tiledbsoma(
|
1126
|
+
... "./my_array_store.tiledbsoma",
|
1127
|
+
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1128
|
+
... categoricals={
|
1129
|
+
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1130
|
+
... "donor_id": ln.ULabel.name
|
1131
|
+
... },
|
1132
|
+
... organism="human",
|
1133
|
+
... )
|
1134
|
+
"""
|
1135
|
+
|
1136
|
+
def __init__(
|
1137
|
+
self,
|
1138
|
+
experiment_uri: UPathStr | Artifact,
|
1139
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
1140
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
1141
|
+
obs_columns: FieldAttr = Feature.name,
|
1142
|
+
organism: str | None = None,
|
1143
|
+
sources: dict[str, Record] | None = None,
|
1144
|
+
exclude: dict[str, str | list[str]] | None = None,
|
1145
|
+
using_key: str | None = None,
|
1146
|
+
):
|
1147
|
+
self._obs_fields = categoricals or {}
|
1148
|
+
self._var_fields = var_index
|
1149
|
+
self._columns_field = obs_columns
|
1150
|
+
if isinstance(experiment_uri, Artifact):
|
1151
|
+
self._experiment_uri = experiment_uri.path
|
1152
|
+
self._artifact = experiment_uri
|
1153
|
+
else:
|
1154
|
+
self._experiment_uri = UPath(experiment_uri)
|
1155
|
+
self._artifact = None
|
1156
|
+
self._organism = organism
|
1157
|
+
self._using_key = using_key
|
1158
|
+
self._sources = sources or {}
|
1159
|
+
self._exclude = exclude or {}
|
1160
|
+
|
1161
|
+
self._validated: bool | None = False
|
1162
|
+
self._non_validated_values: dict[str, list] | None = None
|
1163
|
+
self._validated_values: dict[str, list] = {}
|
1164
|
+
# filled by _check_save_keys
|
1165
|
+
self._n_obs: int | None = None
|
1166
|
+
self._valid_obs_keys: list[str] | None = None
|
1167
|
+
self._obs_pa_schema: pa.lib.Schema | None = (
|
1168
|
+
None # this is needed to create the obs feature set
|
1169
|
+
)
|
1170
|
+
self._valid_var_keys: list[str] | None = None
|
1171
|
+
self._var_fields_flat: dict[str, FieldAttr] | None = None
|
1172
|
+
self._check_save_keys()
|
1173
|
+
|
1174
|
+
# check that the provided keys in var_index and categoricals are available in the store
|
1175
|
+
# and save features
|
1176
|
+
def _check_save_keys(self):
|
1177
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1178
|
+
|
1179
|
+
with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
|
1180
|
+
experiment_obs = experiment.obs
|
1181
|
+
self._n_obs = len(experiment_obs)
|
1182
|
+
self._obs_pa_schema = experiment_obs.schema
|
1183
|
+
valid_obs_keys = [
|
1184
|
+
k for k in self._obs_pa_schema.names if k != "soma_joinid"
|
1185
|
+
]
|
1186
|
+
self._valid_obs_keys = valid_obs_keys
|
1187
|
+
|
1188
|
+
valid_var_keys = []
|
1189
|
+
ms_list = []
|
1190
|
+
for ms in experiment.ms.keys():
|
1191
|
+
ms_list.append(ms)
|
1192
|
+
var_ms = experiment.ms[ms].var
|
1193
|
+
valid_var_keys += [
|
1194
|
+
f"{ms}__{k}" for k in var_ms.keys() if k != "soma_joinid"
|
1195
|
+
]
|
1196
|
+
self._valid_var_keys = valid_var_keys
|
1197
|
+
|
1198
|
+
# check validity of keys in categoricals
|
1199
|
+
nonval_keys = []
|
1200
|
+
for obs_key in self._obs_fields.keys():
|
1201
|
+
if obs_key not in valid_obs_keys:
|
1202
|
+
nonval_keys.append(obs_key)
|
1203
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1204
|
+
|
1205
|
+
# check validity of keys in var_index
|
1206
|
+
self._var_fields_flat = {}
|
1207
|
+
nonval_keys = []
|
1208
|
+
for ms_key in self._var_fields.keys():
|
1209
|
+
var_key, var_field = self._var_fields[ms_key]
|
1210
|
+
var_key_flat = f"{ms_key}__{var_key}"
|
1211
|
+
if var_key_flat not in valid_var_keys:
|
1212
|
+
nonval_keys.append(f"({ms_key}, {var_key})")
|
1213
|
+
else:
|
1214
|
+
self._var_fields_flat[var_key_flat] = var_field
|
1215
|
+
_maybe_curation_keys_not_present(nonval_keys, "var_index")
|
1216
|
+
|
1217
|
+
# check validity of keys in sources and exclude
|
1218
|
+
valid_arg_keys = valid_obs_keys + valid_var_keys + ["columns"]
|
1219
|
+
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1220
|
+
nonval_keys = []
|
1221
|
+
for arg_key in dct.keys():
|
1222
|
+
if arg_key not in valid_arg_keys:
|
1223
|
+
nonval_keys.append(arg_key)
|
1224
|
+
_maybe_curation_keys_not_present(nonval_keys, name)
|
1225
|
+
|
1226
|
+
# register obs columns' names
|
1227
|
+
register_columns = list(self._obs_fields.keys())
|
1228
|
+
organism = check_registry_organism(
|
1229
|
+
self._columns_field.field.model, self._organism
|
1230
|
+
).get("organism")
|
1231
|
+
update_registry(
|
1232
|
+
values=register_columns,
|
1233
|
+
field=self._columns_field,
|
1234
|
+
key="columns",
|
1235
|
+
using_key=self._using_key,
|
1236
|
+
validated_only=False,
|
1237
|
+
organism=organism,
|
1238
|
+
source=self._sources.get("columns"),
|
1239
|
+
exclude=self._exclude.get("columns"),
|
1240
|
+
)
|
1241
|
+
additional_columns = [k for k in valid_obs_keys if k not in register_columns]
|
1242
|
+
# no need to register with validated_only=True if columns are features
|
1243
|
+
if (
|
1244
|
+
len(additional_columns) > 0
|
1245
|
+
and self._columns_field.field.model is not Feature
|
1246
|
+
):
|
1247
|
+
update_registry(
|
1248
|
+
values=additional_columns,
|
1249
|
+
field=self._columns_field,
|
1250
|
+
key="columns",
|
1251
|
+
using_key=self._using_key,
|
1252
|
+
validated_only=True,
|
1253
|
+
organism=organism,
|
1254
|
+
source=self._sources.get("columns"),
|
1255
|
+
exclude=self._exclude.get("columns"),
|
1256
|
+
)
|
1257
|
+
|
1258
|
+
def validate(self):
|
1259
|
+
"""Validate categories."""
|
1260
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1261
|
+
|
1262
|
+
validated = True
|
1263
|
+
self._non_validated_values = {}
|
1264
|
+
with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
|
1265
|
+
for ms, (key, field) in self._var_fields.items():
|
1266
|
+
var_ms = experiment.ms[ms].var
|
1267
|
+
var_ms_key = f"{ms}__{key}"
|
1268
|
+
# it was already validated and cached
|
1269
|
+
if var_ms_key in self._validated_values:
|
1270
|
+
continue
|
1271
|
+
var_ms_values = (
|
1272
|
+
var_ms.read(column_names=[key]).concat()[key].to_pylist()
|
1273
|
+
)
|
1274
|
+
organism = check_registry_organism(
|
1275
|
+
field.field.model, self._organism
|
1276
|
+
).get("organism")
|
1277
|
+
update_registry(
|
1278
|
+
values=var_ms_values,
|
1279
|
+
field=field,
|
1280
|
+
key=var_ms_key,
|
1281
|
+
using_key=self._using_key,
|
1282
|
+
validated_only=True,
|
1283
|
+
organism=organism,
|
1284
|
+
source=self._sources.get(var_ms_key),
|
1285
|
+
exclude=self._exclude.get(var_ms_key),
|
1286
|
+
)
|
1287
|
+
_, non_val = validate_categories(
|
1288
|
+
values=var_ms_values,
|
1289
|
+
field=field,
|
1290
|
+
key=var_ms_key,
|
1291
|
+
using_key=self._using_key,
|
1292
|
+
organism=organism,
|
1293
|
+
source=self._sources.get(var_ms_key),
|
1294
|
+
exclude=self._exclude.get(var_ms_key),
|
1295
|
+
)
|
1296
|
+
if len(non_val) > 0:
|
1297
|
+
validated = False
|
1298
|
+
self._non_validated_values[var_ms_key] = non_val
|
1299
|
+
else:
|
1300
|
+
self._validated_values[var_ms_key] = var_ms_values
|
1301
|
+
|
1302
|
+
obs = experiment.obs
|
1303
|
+
for key, field in self._obs_fields.items():
|
1304
|
+
# already validated and cached
|
1305
|
+
if key in self._validated_values:
|
1306
|
+
continue
|
1307
|
+
values = pa.compute.unique(
|
1308
|
+
obs.read(column_names=[key]).concat()[key]
|
1309
|
+
).to_pylist()
|
1310
|
+
organism = check_registry_organism(
|
1311
|
+
field.field.model, self._organism
|
1312
|
+
).get("organism")
|
1313
|
+
update_registry(
|
1314
|
+
values=values,
|
1315
|
+
field=field,
|
1316
|
+
key=key,
|
1317
|
+
using_key=self._using_key,
|
1318
|
+
validated_only=True,
|
1319
|
+
organism=organism,
|
1320
|
+
source=self._sources.get(key),
|
1321
|
+
exclude=self._exclude.get(key),
|
1322
|
+
)
|
1323
|
+
_, non_val = validate_categories(
|
1324
|
+
values=values,
|
1325
|
+
field=field,
|
1326
|
+
key=key,
|
1327
|
+
using_key=self._using_key,
|
1328
|
+
organism=organism,
|
1329
|
+
source=self._sources.get(key),
|
1330
|
+
exclude=self._exclude.get(key),
|
1331
|
+
)
|
1332
|
+
if len(non_val) > 0:
|
1333
|
+
validated = False
|
1334
|
+
self._non_validated_values[key] = non_val
|
1335
|
+
else:
|
1336
|
+
self._validated_values[key] = values
|
1337
|
+
self._validated = validated
|
1338
|
+
return self._validated
|
1339
|
+
|
1340
|
+
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
1341
|
+
assert self._non_validated_values is not None # noqa: S101
|
1342
|
+
|
1343
|
+
if key in self._valid_obs_keys:
|
1344
|
+
field = self._obs_fields[key]
|
1345
|
+
elif key in self._valid_var_keys:
|
1346
|
+
ms = key.partition("__")[0]
|
1347
|
+
field = self._var_fields[ms][1]
|
1348
|
+
else:
|
1349
|
+
raise KeyError(f"key {key} is invalid!")
|
1350
|
+
values = self._non_validated_values.get(key, [])
|
1351
|
+
return values, field
|
1352
|
+
|
1353
|
+
def add_new_from(self, key: str) -> None:
|
1354
|
+
"""Add validated & new categories.
|
1355
|
+
|
1356
|
+
Args:
|
1357
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
1358
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
1359
|
+
or a column name in `.obs`.
|
1360
|
+
"""
|
1361
|
+
if self._non_validated_values is None:
|
1362
|
+
raise ValidationError("Run .validate() first.")
|
1363
|
+
if key == "all":
|
1364
|
+
keys = list(self._non_validated_values.keys())
|
1365
|
+
else:
|
1366
|
+
avail_keys = list(
|
1367
|
+
chain(self._non_validated_values.keys(), self._validated_values.keys())
|
1368
|
+
)
|
1369
|
+
if key not in avail_keys:
|
1370
|
+
raise KeyError(
|
1371
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
1372
|
+
)
|
1373
|
+
keys = [key]
|
1374
|
+
for k in keys:
|
1375
|
+
values, field = self._non_validated_values_field(k)
|
1376
|
+
if len(values) == 0:
|
1377
|
+
continue
|
1378
|
+
organism = check_registry_organism(field.field.model, self._organism).get(
|
1379
|
+
"organism"
|
1380
|
+
)
|
1381
|
+
update_registry(
|
1382
|
+
values=values,
|
1383
|
+
field=field,
|
1384
|
+
key=k,
|
1385
|
+
using_key=self._using_key,
|
1386
|
+
validated_only=False,
|
1387
|
+
organism=organism,
|
1388
|
+
source=self._sources.get(k),
|
1389
|
+
exclude=self._exclude.get(k),
|
1390
|
+
)
|
1391
|
+
# update non-validated values list but keep the key there
|
1392
|
+
# it will be removed by .validate()
|
1393
|
+
if k in self._non_validated_values:
|
1394
|
+
self._non_validated_values[k] = []
|
1395
|
+
|
1396
|
+
@property
|
1397
|
+
def non_validated(self) -> dict[str, list]:
|
1398
|
+
"""Return the non-validated features and labels."""
|
1399
|
+
non_val = {k: v for k, v in self._non_validated_values.items() if v != []}
|
1400
|
+
return non_val
|
1401
|
+
|
1402
|
+
@property
|
1403
|
+
def var_index(self) -> dict[str, FieldAttr]:
|
1404
|
+
"""Return the registry fields with flattened keys to validate variables indices against."""
|
1405
|
+
return self._var_fields_flat
|
1406
|
+
|
1407
|
+
@property
|
1408
|
+
def categoricals(self) -> dict[str, FieldAttr]:
|
1409
|
+
"""Return the obs fields to validate against."""
|
1410
|
+
return self._obs_fields
|
1411
|
+
|
1412
|
+
def lookup(
|
1413
|
+
self, using_key: str | None = None, public: bool = False
|
1414
|
+
) -> CurateLookup:
|
1415
|
+
"""Lookup categories.
|
1416
|
+
|
1417
|
+
Args:
|
1418
|
+
using_key: The instance where the lookup is performed.
|
1419
|
+
if "public", the lookup is performed on the public reference.
|
1420
|
+
"""
|
1421
|
+
return CurateLookup(
|
1422
|
+
categoricals=self._obs_fields,
|
1423
|
+
slots={"columns": self._columns_field, **self._var_fields_flat},
|
1424
|
+
using_key=using_key or self._using_key,
|
1425
|
+
public=public,
|
1426
|
+
)
|
1427
|
+
|
1428
|
+
def standardize(self, key: str):
|
1429
|
+
"""Replace synonyms with standardized values.
|
1430
|
+
|
1431
|
+
Modifies the dataset inplace.
|
1432
|
+
|
1433
|
+
Args:
|
1434
|
+
key: The key referencing the slot in the `tiledbsoma` store.
|
1435
|
+
It should be `'{measurement name}__{column name in .var}'` for columns in `.var`
|
1436
|
+
or a column name in `.obs`.
|
1437
|
+
"""
|
1438
|
+
if len(self.non_validated) == 0:
|
1439
|
+
logger.warning("values are already standardized")
|
1440
|
+
return
|
1441
|
+
avail_keys = list(self._non_validated_values.keys())
|
1442
|
+
if key == "all":
|
1443
|
+
keys = avail_keys
|
1444
|
+
else:
|
1445
|
+
if key not in avail_keys:
|
1446
|
+
raise KeyError(
|
1447
|
+
f"'{key!r}' is not a valid key, available keys are: {_format_values(avail_keys + ['all'])}!"
|
1448
|
+
)
|
1449
|
+
keys = [key]
|
1450
|
+
|
1451
|
+
for k in keys:
|
1452
|
+
values, field = self._non_validated_values_field(k)
|
1453
|
+
if len(values) == 0:
|
1454
|
+
continue
|
1455
|
+
if k in self._valid_var_keys:
|
1456
|
+
ms, _, slot_key = k.partition("__")
|
1457
|
+
slot = lambda experiment: experiment.ms[ms].var # noqa: B023
|
1458
|
+
else:
|
1459
|
+
slot = lambda experiment: experiment.obs
|
1460
|
+
slot_key = k
|
1461
|
+
# errors if public ontology and the model has no organism
|
1462
|
+
# has to be fixed in bionty
|
1463
|
+
organism = check_registry_organism(field.field.model, self._organism).get(
|
1464
|
+
"organism"
|
1465
|
+
)
|
1466
|
+
syn_mapper = standardize_categories(
|
1467
|
+
values=values,
|
1468
|
+
field=field,
|
1469
|
+
using_key=self._using_key,
|
1470
|
+
source=self._sources.get(k),
|
1471
|
+
organism=organism,
|
1472
|
+
)
|
1473
|
+
if (n_syn_mapper := len(syn_mapper)) == 0:
|
1474
|
+
continue
|
1475
|
+
|
1476
|
+
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1477
|
+
|
1478
|
+
with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
|
1479
|
+
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
1480
|
+
table = slot(experiment).read(value_filter=value_filter).concat()
|
1481
|
+
|
1482
|
+
if len(table) == 0:
|
1483
|
+
continue
|
1484
|
+
|
1485
|
+
df = table.to_pandas()
|
1486
|
+
# map values
|
1487
|
+
df[slot_key] = df[slot_key].map(
|
1488
|
+
lambda val: syn_mapper.get(val, val) # noqa
|
1489
|
+
)
|
1490
|
+
# write the mapped values
|
1491
|
+
with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
|
1492
|
+
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
1493
|
+
# update non_validated dict
|
1494
|
+
non_val_k = [
|
1495
|
+
nv for nv in self._non_validated_values[k] if nv not in syn_mapper
|
1496
|
+
]
|
1497
|
+
self._non_validated_values[k] = non_val_k
|
1498
|
+
|
1499
|
+
syn_mapper_print = _format_values(
|
1500
|
+
[f'"{m_k}" → "{m_v}"' for m_k, m_v in syn_mapper.items()], sep=""
|
1501
|
+
)
|
1502
|
+
s = "s" if n_syn_mapper > 1 else ""
|
1503
|
+
logger.success(
|
1504
|
+
f'standardized {n_syn_mapper} synonym{s} in "{k}": {colors.green(syn_mapper_print)}'
|
1505
|
+
)
|
1506
|
+
|
1507
|
+
def save_artifact(
    self,
    description: str | None = None,
    key: str | None = None,
    revises: Artifact | None = None,
    run: Run | None = None,
) -> Artifact:
    """Save the validated `tiledbsoma` store and metadata.

    If the curator has not been validated yet, `validate()` is run first.
    When the curator does not already hold an artifact, a new one is created
    from the experiment URI; otherwise the existing artifact is annotated.
    Schemas for `obs` and for every measurement's `var` are staged on the
    artifact, and the validated `obs` categories are attached as labels.

    Args:
        description: A description of the ``tiledbsoma`` store.
        key: A path-like key to reference artifact in default storage,
            e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
        revises: Previous version of the artifact. Triggers a revision.
        run: The run that creates the artifact.

    Returns:
        A saved artifact record.

    Raises:
        ValidationError: If the dataset still does not validate after
            calling `validate()`.
    """
    from lamindb.core._data import add_labels

    if not self._validated:
        self.validate()
        if not self._validated:
            raise ValidationError("Dataset does not validate. Please curate.")

    if self._artifact is None:
        artifact = Artifact(
            self._experiment_uri,
            description=description,
            key=key,
            revises=revises,
            run=run,
        )
        artifact.n_observations = self._n_obs
        artifact.otype = "tiledbsoma"
        artifact.save()
    else:
        artifact = self._artifact

    _schemas_m2m = {}
    if len(self._obs_fields) > 0:
        organism = check_registry_organism(
            self._columns_field.field.model, self._organism
        ).get("organism")
        # An empty frame that carries only the obs column names and dtypes is
        # enough to derive the obs schema.
        empty_dict = {pa_field.name: [] for pa_field in self._obs_pa_schema}  # type: ignore
        mock_df = pa.Table.from_pydict(
            empty_dict, schema=self._obs_pa_schema
        ).to_pandas()
        # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
        _schemas_m2m["obs"] = Schema.from_df(
            df=mock_df,
            field=self._columns_field,
            mute=True,
            organism=organism,
        )
    for ms in self._var_fields:
        var_key, var_field = self._var_fields[ms]
        organism = check_registry_organism(
            var_field.field.model, self._organism
        ).get("organism")
        _schemas_m2m[f"{ms}__var"] = Schema.from_values(
            values=self._validated_values[f"{ms}__{var_key}"],
            field=var_field,
            organism=organism,
            raise_validation_error=False,
        )
    artifact._staged__schemas_m2m = _schemas_m2m

    feature_ref_is_name = _ref_is_name(self._columns_field)
    features = Feature.lookup().dict()
    # `obs_key` (not `key`) so the loop does not shadow the `key` parameter.
    for obs_key, field in self._obs_fields.items():
        feature = features.get(obs_key)
        registry = field.field.model
        organism = check_registry_organism(registry, self._organism).get(
            "organism"
        )
        labels = registry.from_values(
            values=self._validated_values[obs_key], field=field, organism=organism
        )
        if len(labels) == 0:
            continue
        # Reset on every iteration: a registry without `_name_field` must not
        # hit an unbound local or inherit the previous registry's value.
        label_ref_is_name = None
        if hasattr(registry, "_name_field"):
            label_ref_is_name = field.field.name == registry._name_field
        add_labels(
            artifact,
            records=labels,
            feature=feature,
            feature_ref_is_name=feature_ref_is_name,
            label_ref_is_name=label_ref_is_name,
            from_curator=True,
        )

    return artifact.save()
|
1601
|
+
|
1602
|
+
|
1061
1603
|
class Curator(BaseCurator):
|
1062
1604
|
"""Dataset curator.
|
1063
1605
|
|
@@ -1072,7 +1614,7 @@ class Curator(BaseCurator):
|
|
1072
1614
|
>>> categoricals={"perturbation": ln.ULabel.name}, # map categories
|
1073
1615
|
>>> )
|
1074
1616
|
>>> curator.validate() # validate the data in df
|
1075
|
-
>>> artifact =
|
1617
|
+
>>> artifact = curator.save_artifact(description="my RNA-seq")
|
1076
1618
|
>>> artifact.describe() # see annotations
|
1077
1619
|
|
1078
1620
|
`curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
|
@@ -1150,6 +1692,105 @@ class Curator(BaseCurator):
|
|
1150
1692
|
organism=organism,
|
1151
1693
|
)
|
1152
1694
|
|
1695
|
+
@classmethod
|
1696
|
+
@doc_args(SOMACurator.__doc__)
|
1697
|
+
def from_tiledbsoma(
    cls,
    experiment_uri: UPathStr,
    var_index: dict[str, tuple[str, FieldAttr]],
    categoricals: dict[str, FieldAttr] | None = None,
    obs_columns: FieldAttr = Feature.name,
    using_key: str | None = None,
    organism: str | None = None,
    sources: dict[str, Record] | None = None,
    exclude: dict[str, str | list[str]] | None = None,
) -> SOMACurator:
    """{}"""  # noqa: D415
    # NOTE: the docstring is a format template; the decorator on this method
    # is handed `SOMACurator.__doc__`, presumably to fill the placeholder, so
    # the template is left untouched.
    # Every argument is forwarded unchanged under the same keyword.
    curator_kwargs = {
        "experiment_uri": experiment_uri,
        "var_index": var_index,
        "categoricals": categoricals,
        "obs_columns": obs_columns,
        "using_key": using_key,
        "organism": organism,
        "sources": sources,
        "exclude": exclude,
    }
    return SOMACurator(**curator_kwargs)
|
1719
|
+
|
1720
|
+
@classmethod
|
1721
|
+
def from_spatialdata(
    cls,
    sdata: SpatialData,
    var_index: dict[str, FieldAttr],
    categoricals: dict[str, dict[str, FieldAttr]] | None = None,
    using_key: str | None = None,
    organism: str | None = None,
    sources: dict[str, dict[str, Record]] | None = None,
    exclude: dict[str, dict] | None = None,
    verbosity: str = "hint",
    *,
    sample_metadata_key: str = "sample",
) -> SpatialDataCurator:
    """Curation flow for a ``SpatialData`` object.

    See also :class:`~lamindb.Curator`.

    Note that if genes or other measurements are removed from the SpatialData object,
    the object should be recreated.

    In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.

    Args:
        sdata: The SpatialData object to curate.
        var_index: A dictionary mapping table keys to the ``.var`` indices.
        categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
        exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
            When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
            using the exclude parameter ensures they are not validated.
        verbosity: The verbosity level of the logger.
        sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.

    Returns:
        A ``SpatialDataCurator`` constructed from the given arguments.

    Raises:
        ImportError: If the ``spatialdata`` package is not installed.

    Examples:
        >>> import lamindb as ln
        >>> import bionty as bt
        >>> curator = ln.Curator.from_spatialdata(
        ...     sdata,
        ...     var_index={
        ...         "table_1": bt.Gene.ensembl_gene_id,
        ...     },
        ...     categoricals={
        ...         "table_1":
        ...             {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
        ...         "sample":
        ...             {"experimental_factor": bt.ExperimentalFactor.name},
        ...     },
        ...     organism="human",
        ... )
    """
    # The import is only an availability check, so a missing optional
    # dependency surfaces as an actionable message.
    try:
        import spatialdata  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "Please install spatialdata: pip install spatialdata"
        ) from e

    # Imported here, after the availability check above has passed.
    from ._spatial import SpatialDataCurator

    return SpatialDataCurator(
        sdata=sdata,
        var_index=var_index,
        categoricals=categoricals,
        using_key=using_key,
        verbosity=verbosity,
        organism=organism,
        sources=sources,
        exclude=exclude,
        sample_metadata_key=sample_metadata_key,
    )
|
1793
|
+
|
1153
1794
|
|
1154
1795
|
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
|
1155
1796
|
"""Get a registry instance using a specific instance."""
|
@@ -1253,7 +1894,7 @@ def validate_categories(
|
|
1253
1894
|
standardize: Whether to standardize the values.
|
1254
1895
|
hint_print: The hint to print that suggests fixing non-validated values.
|
1255
1896
|
"""
|
1256
|
-
from lamindb._from_values import
|
1897
|
+
from lamindb._from_values import _format_values
|
1257
1898
|
from lamindb.core._settings import settings
|
1258
1899
|
|
1259
1900
|
model_field = f"{field.field.model.__name__}.{field.field.name}"
|
@@ -1315,22 +1956,17 @@ def validate_categories(
|
|
1315
1956
|
non_validated = [i for i in non_validated if i not in values_validated]
|
1316
1957
|
n_non_validated = len(non_validated)
|
1317
1958
|
if n_non_validated == 0:
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
|
1322
|
-
return True, []
|
1323
|
-
else:
|
1324
|
-
# validated values still need to be saved to the current instance
|
1325
|
-
return False, []
|
1959
|
+
logger.indent = ""
|
1960
|
+
logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
|
1961
|
+
return True, []
|
1326
1962
|
else:
|
1327
1963
|
are = "is" if n_non_validated == 1 else "are"
|
1328
1964
|
s = "" if n_non_validated == 1 else "s"
|
1329
|
-
print_values =
|
1965
|
+
print_values = _format_values(non_validated)
|
1330
1966
|
warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
|
1331
1967
|
if syn_mapper:
|
1332
1968
|
s = "" if len(syn_mapper) == 1 else "s"
|
1333
|
-
syn_mapper_print =
|
1969
|
+
syn_mapper_print = _format_values(
|
1334
1970
|
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
1335
1971
|
)
|
1336
1972
|
hint_msg = f'.standardize("{key}")'
|
@@ -1445,8 +2081,8 @@ def save_artifact(
|
|
1445
2081
|
Returns:
|
1446
2082
|
The saved Artifact.
|
1447
2083
|
"""
|
1448
|
-
from
|
1449
|
-
from
|
2084
|
+
from .._artifact import data_is_anndata
|
2085
|
+
from ..core._data import add_labels
|
1450
2086
|
|
1451
2087
|
artifact = None
|
1452
2088
|
if data_is_anndata(data):
|
@@ -1489,13 +2125,13 @@ def save_artifact(
|
|
1489
2125
|
organism,
|
1490
2126
|
)
|
1491
2127
|
|
1492
|
-
if artifact.
|
2128
|
+
if artifact.otype == "DataFrame":
|
1493
2129
|
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
|
1494
|
-
elif artifact.
|
2130
|
+
elif artifact.otype == "AnnData":
|
1495
2131
|
artifact.features._add_set_from_anndata(
|
1496
2132
|
var_field=columns_field, **feature_kwargs
|
1497
2133
|
)
|
1498
|
-
elif artifact.
|
2134
|
+
elif artifact.otype == "MuData":
|
1499
2135
|
artifact.features._add_set_from_mudata(
|
1500
2136
|
var_fields=columns_field, **feature_kwargs
|
1501
2137
|
)
|
@@ -1515,25 +2151,31 @@ def save_artifact(
|
|
1515
2151
|
filter_kwargs = check_registry_organism(registry, organism)
|
1516
2152
|
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
1517
2153
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2154
|
+
# multi-value columns are separated by "|"
|
2155
|
+
if df[key].str.contains("|").any():
|
2156
|
+
values = df[key].str.split("|").explode().unique()
|
2157
|
+
else:
|
2158
|
+
values = df[key].unique()
|
1518
2159
|
labels = registry.from_values(
|
1519
|
-
|
2160
|
+
values,
|
1520
2161
|
field=field,
|
1521
2162
|
**filter_kwargs_current,
|
1522
2163
|
)
|
1523
2164
|
if len(labels) == 0:
|
1524
2165
|
continue
|
2166
|
+
label_ref_is_name = None
|
1525
2167
|
if hasattr(registry, "_name_field"):
|
1526
2168
|
label_ref_is_name = field.field.name == registry._name_field
|
1527
|
-
|
1528
|
-
|
1529
|
-
|
1530
|
-
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
2169
|
+
add_labels(
|
2170
|
+
artifact,
|
2171
|
+
records=labels,
|
2172
|
+
feature=feature,
|
2173
|
+
feature_ref_is_name=feature_ref_is_name,
|
2174
|
+
label_ref_is_name=label_ref_is_name,
|
2175
|
+
from_curator=True,
|
2176
|
+
)
|
1535
2177
|
|
1536
|
-
if artifact.
|
2178
|
+
if artifact.otype == "MuData":
|
1537
2179
|
for modality, modality_fields in fields.items():
|
1538
2180
|
column_field_modality = columns_field.get(modality)
|
1539
2181
|
if modality == "obs":
|
@@ -1616,6 +2258,7 @@ def update_registry(
|
|
1616
2258
|
registry = field.field.model
|
1617
2259
|
filter_kwargs = check_registry_organism(registry, organism)
|
1618
2260
|
filter_kwargs.update({"source": source} if source else {})
|
2261
|
+
values = [i for i in values if isinstance(i, str) and i]
|
1619
2262
|
if not values:
|
1620
2263
|
return
|
1621
2264
|
|
@@ -1710,7 +2353,7 @@ def log_saved_labels(
|
|
1710
2353
|
validated_only: bool = True,
|
1711
2354
|
) -> None:
|
1712
2355
|
"""Log the saved labels."""
|
1713
|
-
from
|
2356
|
+
from .._from_values import _format_values
|
1714
2357
|
|
1715
2358
|
model_field = colors.italic(model_field)
|
1716
2359
|
for k, labels in labels_saved.items():
|
@@ -1724,7 +2367,7 @@ def log_saved_labels(
|
|
1724
2367
|
# labels from a public ontology or a different instance to the present instance
|
1725
2368
|
s = "s" if len(labels) > 1 else ""
|
1726
2369
|
logger.success(
|
1727
|
-
f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {
|
2370
|
+
f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_format_values(labels)}'
|
1728
2371
|
)
|
1729
2372
|
|
1730
2373
|
|
@@ -1800,7 +2443,7 @@ def _save_organism(name: str):
|
|
1800
2443
|
|
1801
2444
|
def _ref_is_name(field: FieldAttr) -> bool | None:
|
1802
2445
|
"""Check if the reference field is a name field."""
|
1803
|
-
from
|
2446
|
+
from .._can_curate import get_name_field
|
1804
2447
|
|
1805
2448
|
name_field = get_name_field(field.field.model)
|
1806
2449
|
return field.field.name == name_field
|