lamindb 0.76.14__py3-none-any.whl → 0.76.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +44 -35
- lamindb/_can_validate.py +31 -22
- lamindb/_collection.py +6 -5
- lamindb/_curate.py +80 -48
- lamindb/_feature.py +2 -3
- lamindb/_feature_set.py +1 -2
- lamindb/_finish.py +12 -7
- lamindb/_is_versioned.py +1 -2
- lamindb/_parents.py +28 -5
- lamindb/_query_manager.py +1 -2
- lamindb/_query_set.py +51 -6
- lamindb/_record.py +125 -62
- lamindb/_save.py +2 -2
- lamindb/_transform.py +1 -2
- lamindb/_ulabel.py +1 -1
- lamindb/core/_context.py +48 -26
- lamindb/core/_label_manager.py +1 -1
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +7 -4
- lamindb/core/storage/_backed_access.py +16 -8
- lamindb/core/storage/_pyarrow_dataset.py +31 -0
- {lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/METADATA +20 -9
- {lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/RECORD +26 -26
- lamindb/_filter.py +0 -21
- {lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/LICENSE +0 -0
- {lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
lamindb/_artifact.py
CHANGED
@@ -28,39 +28,41 @@ from lnschema_core.types import (
     VisibilityChoice,
 )
 
-from …
-from …
-…
-…
-…
-…
+from ._utils import attach_func_to_class_method
+from .core._data import (
+    _track_run_input,
+    add_transform_to_kwargs,
+    describe,
+    get_run,
+    save_feature_set_links,
+    save_feature_sets,
+    view_lineage,
+)
+from .core._settings import settings
+from .core.exceptions import IntegrityError, InvalidArgument
+from .core.loaders import load_to_memory
+from .core.storage import (
     LocalPathClasses,
     UPath,
     delete_storage,
     infer_suffix,
     write_to_disk,
 )
-from …
+from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+from .core.storage.objects import _mudata_is_installed
+from .core.storage.paths import (
+    AUTO_KEY_PREFIX,
     auto_storage_key_from_artifact,
     auto_storage_key_from_artifact_uid,
     check_path_is_child_of_root,
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from …
+from .core.versioning import (
     create_uid,
     message_update_key_in_version_family,
 )
 
-from .core._data import (
-    add_transform_to_kwargs,
-    get_run,
-    save_feature_set_links,
-    save_feature_sets,
-)
-from .core.storage.objects import _mudata_is_installed
-from .core.storage.paths import AUTO_KEY_PREFIX
-
 try:
     from .core.storage._zarr import zarr_is_adata
 except ImportError:
@@ -72,6 +74,7 @@ except ImportError:
 if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
     from mudata import MuData
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
 
@@ -108,7 +111,12 @@ def process_pathlike(
     # for the storage root: the bucket
     if not isinstance(filepath, LocalPathClasses):
         # for a cloud path, new_root is always the bucket name
-        …
+        if filepath.protocol == "hf":
+            hf_path = filepath.fs.resolve_path(filepath.as_posix())
+            hf_path.path_in_repo = ""
+            new_root = "hf://" + hf_path.unresolve()
+        else:
+            new_root = list(filepath.parents)[-1]
         # do not register remote storage locations on hub if the current instance
         # is not managed on the hub
         storage_settings, _ = init_storage(
@@ -210,9 +218,9 @@ def get_stat_or_artifact(
     if stat is not None:
         # convert UPathStatResult to fsspec info dict
         stat = stat.as_info()
-        if …
+        if (store_type := stat["type"]) == "file":
             size, hash, hash_type = get_stat_file_cloud(stat)
-        elif …
+        elif store_type == "directory":
             size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
         if hash is None:
             logger.warning(f"did not add hash for {path}")
@@ -237,7 +245,7 @@ def get_stat_or_artifact(
         .order_by("-created_at")
         .all()
     )
-    artifact_with_same_hash_exists = …
+    artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
     if not artifact_with_same_hash_exists and len(result) > 0:
         logger.important(
             f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
@@ -772,19 +780,14 @@ def from_dir(
     else:
         folder_key_path = Path(key)
 
-
-    folder_key = folder_key_path.as_posix().rstrip("/")
-
-    # TODO: (non-local) UPath doesn't list the first level artifacts and dirs with "*"
-    pattern = "" if not isinstance(folderpath, LocalPathClasses) else "*"
-
+    folder_key = folder_key_path.as_posix()
     # silence fine-grained logging
     verbosity = settings.verbosity
     verbosity_int = settings._verbosity_int
     if verbosity_int >= 1:
         settings.verbosity = "warning"
     artifacts_dict = {}
-    for filepath in folderpath.rglob(…
+    for filepath in folderpath.rglob("*"):
         if filepath.is_file():
             relative_path = get_relative_path_to_directory(filepath, folderpath)
             artifact_key = folder_key + "/" + relative_path.as_posix()
@@ -802,7 +805,8 @@ def from_dir(
         if artifact.hash is not None
     ]
     uids = artifacts_dict.keys()
-    …
+    n_unique_hashes = len(set(hashes))
+    if n_unique_hashes == len(hashes):
         artifacts = list(artifacts_dict.values())
     else:
         # consider exact duplicates (same id, same hash)
@@ -811,7 +815,7 @@ def from_dir(
         # logger.warning("dropping duplicate records in list of artifact records")
         # artifacts = list(set(uids))
         # consider false duplicates (different id, same hash)
-        if not len(set(uids)) == …
+        if not len(set(uids)) == n_unique_hashes:
             seen_hashes = set()
             non_unique_artifacts = {
                 hash: artifact
@@ -905,14 +909,19 @@ def replace(
 # docstring handled through attach_func_to_class_method
 def open(
     self, mode: str = "r", is_run_input: bool | None = None
-) -> …
+) -> (
+    AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
+):
     # ignore empty suffix for now
-    suffixes = (".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma"…
+    suffixes = ("", ".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma") + PYARROW_SUFFIXES
     if self.suffix not in suffixes:
         raise ValueError(
-            "Artifact should have a zarr, h5…
-            "…
-            …
+            "Artifact should have a zarr, h5, tiledbsoma object"
+            " or a compatible `pyarrow.dataset.dataset` directory"
+            " as the underlying data, please use one of the following suffixes"
+            f" for the object name: {', '.join(suffixes[1:])}."
+            f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
+            " (no mixing allowed)."
        )
     if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
         raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
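Note on the open() change above: Artifact.open() now also returns a pyarrow.dataset.Dataset for artifacts whose suffix is in PYARROW_SUFFIXES (defined in the new lamindb/core/storage/_pyarrow_dataset.py), or for suffix-less folders containing only such files. A minimal usage sketch, assuming a previously saved .parquet artifact; the key below is hypothetical:

    import lamindb as ln

    # assumes an artifact with this key was saved earlier (hypothetical key)
    artifact = ln.Artifact.filter(key="datasets/measurements.parquet").one()
    dataset = artifact.open()  # pyarrow.dataset.Dataset for parquet-like suffixes
    table = dataset.to_table()  # materialize via pyarrow, e.g. table.to_pandas()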
lamindb/_can_validate.py
CHANGED
@@ -10,10 +10,10 @@ from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
 from lnschema_core import CanValidate, Record
 
-from lamindb._utils import attach_func_to_class_method
-
 from ._from_values import _has_organism_field, _print_values, get_or_create_records
 from ._record import _queryset, get_name_field
+from ._utils import attach_func_to_class_method
+from .core.exceptions import ValidationError
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -108,14 +108,14 @@ def _check_organism_db(organism: Record, using_key: str | None):
 
 def _concat_lists(values: ListLike) -> list[str]:
     """Concatenate a list of lists of strings into a single list."""
-    if …
-…
-…
-…
-…
-…
-…
+    if isinstance(values, (list, pd.Series)) and len(values) > 0:
+        first_item = values[0] if isinstance(values, list) else values.iloc[0]
+        if isinstance(first_item, list):
+            if isinstance(values, pd.Series):
+                values = values.tolist()
+            values = [
+                v for sublist in values if isinstance(sublist, list) for v in sublist
+            ]
     return values
 
 
@@ -250,7 +250,7 @@ def _validate(
             f"Your {cls.__name__} registry is empty, consider populating it first!"
         )
         if hasattr(cls, "source_id"):
-            msg += "\n → use `.…
+            msg += "\n → use `.import_source()` to import records from a source, e.g. a public ontology"
         logger.warning(msg)
         return np.array([False] * len(values))
 
@@ -388,7 +388,11 @@ def _standardize(
 
     try:
         registry._meta.get_field(synonyms_field)
-        fields = {…
+        fields = {
+            field_name
+            for field_name in [field, return_field, synonyms_field]
+            if field_name is not None
+        }
         df = _filter_query_based_on_organism(
             queryset=queryset,
             field=field,
@@ -445,14 +449,19 @@ def _standardize(
     if len(std_names_bt_mapper) > 0 and not mute:
         s = "" if len(std_names_bt_mapper) == 1 else "s"
         field_print = "synonym" if field == return_field else field
-…
-…
-…
+
+        reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}"
+        truncated_note = (
+            " (output truncated)" if len(std_names_bt_mapper) > 10 else ""
         )
-…
-…
-            f"…
+
+        warn_msg = (
+            f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty{truncated_note}:"
+            f" {reduced_mapped_keys_str}\n"
+            f"  please add corresponding {registry._meta.model.__name__} records via{truncated_note}:"
+            f" `.from_values({reduced_mapped_keys_str})`"
         )
+
         logger.warning(warn_msg)
 
         mapper.update(std_names_bt_mapper)
@@ -496,9 +505,9 @@ def _add_or_remove_synonyms(
             " with the following records:\n"
         )
         display(records_df)
-        raise …
-            "…
-            "…
+        raise ValidationError(
+            f"you are trying to assign a synonym to record: {record}\n"
+            "  → consider removing the synonym from existing records or using a different synonym."
         )
 
     # passed synonyms
@@ -516,7 +525,7 @@ def _add_or_remove_synonyms(
         return
     # because we use | as the separator
     if any("|" in i for i in syn_new_set):
-        raise …
+        raise ValidationError("a synonym can't contain '|'!")
 
     # existing synonyms
     syns_exist = record.synonyms
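Note on the rewritten _concat_lists above: it now only flattens when the first element is itself a list, and it accepts both list and pd.Series inputs. A standalone sketch of the behavior it implements; the values are illustrative:

    import pandas as pd

    values = pd.Series([["CD4", "CD8"], ["FOXP3"]])  # illustrative labels
    first_item = values.iloc[0]
    if isinstance(first_item, list):
        # one level of flattening; non-list entries are skipped, as in the diff
        flat = [v for sub in values.tolist() if isinstance(sub, list) for v in sub]
    # flat == ["CD4", "CD8", "FOXP3"]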
lamindb/_collection.py
CHANGED
@@ -20,20 +20,21 @@ from lnschema_core.models import (
 )
 from lnschema_core.types import VisibilityChoice
 
-from lamindb._utils import attach_func_to_class_method
-from lamindb.core._data import _track_run_input, describe, view_lineage
-from lamindb.core._mapped_collection import MappedCollection
-from lamindb.core.versioning import process_revises
-
 from . import Artifact, Run
 from ._record import init_self_from_db, update_attributes
+from ._utils import attach_func_to_class_method
 from .core._data import (
+    _track_run_input,
     add_transform_to_kwargs,
+    describe,
     get_run,
     save_feature_set_links,
     save_feature_sets,
+    view_lineage,
 )
+from .core._mapped_collection import MappedCollection
 from .core._settings import settings
+from .core.versioning import process_revises
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
lamindb/_curate.py
CHANGED
@@ -20,6 +20,7 @@ from .core.exceptions import ValidationError
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
+    from typing import Any
 
     from lamindb_setup.core.types import UPathStr
     from lnschema_core.types import FieldAttr
@@ -184,7 +185,7 @@ class DataFrameCurator(BaseCurator):
     def non_validated(self) -> list:
         """Return the non-validated features and labels."""
         if self._non_validated is None:
-            raise …
+            raise ValidationError("Please run validate() first!")
         return self._non_validated
 
     @property
@@ -222,11 +223,11 @@ class DataFrameCurator(BaseCurator):
         valid_keys = set(self._df.columns) | {"columns"} | extra
         nonval_keys = [key for key in d.keys() if key not in valid_keys]
         if len(nonval_keys) > 0:
-            raise …
+            raise ValidationError(
                 f"the following keys passed to {name} are not allowed: {nonval_keys}"
             )
 
-    def _save_columns(self, validated_only: bool = True…
+    def _save_columns(self, validated_only: bool = True) -> None:
         """Save column name records."""
         # Always save features specified as the fields keys
         update_registry(
@@ -238,7 +239,7 @@ class DataFrameCurator(BaseCurator):
             validated_only=False,
             source=self._sources.get("columns"),
             exclude=self._exclude.get("columns"),
-            **…
+            **self._kwargs,  # type: ignore
         )
 
         # Save the rest of the columns based on validated_only
@@ -255,7 +256,7 @@ class DataFrameCurator(BaseCurator):
                 source=self._sources.get("columns"),
                 exclude=self._exclude.get("columns"),
                 warning=False,  # Do not warn about missing columns, just an info message
-                **…
+                **self._kwargs,  # type: ignore
             )
 
     def add_new_from(self, key: str, organism: str | None = None, **kwargs):
@@ -288,9 +289,11 @@ class DataFrameCurator(BaseCurator):
             self._save_columns(validated_only=validated_only, **kwargs)
         else:
             if categorical not in self.fields:
-                raise …
+                raise ValidationError(
+                    f"Feature {categorical} is not part of the fields!"
+                )
             update_registry(
-                values=self._df[categorical]…
+                values=_flatten_unique(self._df[categorical]),
                 field=self.fields[categorical],
                 key=categorical,
                 using_key=self._using_key,
@@ -303,7 +306,6 @@ class DataFrameCurator(BaseCurator):
     def _update_registry_all(self, validated_only: bool = True, **kwargs):
         """Save labels for all features."""
         for name in self.fields.keys():
-            logger.info(f"saving validated records of '{name}'")
             self._update_registry(name, validated_only=validated_only, **kwargs)
 
     def validate(self, organism: str | None = None) -> bool:
@@ -434,12 +436,15 @@ class AnnDataCurator(DataFrameCurator):
     ) -> None:
         from lamindb_setup.core import upath
 
+        if isinstance(var_index, str):
+            raise TypeError("var_index parameter has to be a bionty field")
+
         from ._artifact import data_is_anndata
 
         if sources is None:
             sources = {}
         if not data_is_anndata(data):
-            raise …
+            raise TypeError(
                 "data has to be an AnnData object or a path to AnnData-like"
             )
         if isinstance(data, ad.AnnData):
@@ -449,6 +454,11 @@ class AnnDataCurator(DataFrameCurator):
 
             self._adata = backed_access(upath.create_path(data))
 
+        if "symbol" in str(var_index):
+            logger.warning(
+                "Curating gene symbols is discouraged. See FAQ for more details."
+            )
+
         self._data = data
         self._var_field = var_index
         super().__init__(
@@ -508,13 +518,11 @@ class AnnDataCurator(DataFrameCurator):
             exclude=self._exclude.get("var_index"),
         )
 
-    def _update_registry_all(self):
+    def _update_registry_all(self, validated_only: bool = True, **kwargs):
         """Save labels for all features."""
-…
-        self._save_from_var_index(validated_only=True, **self._kwargs)
+        self._save_from_var_index(validated_only=validated_only, **self._kwargs)
         for name in self._obs_fields.keys():
-…
-            self._update_registry(name, validated_only=True, **self._kwargs)
+            self._update_registry(name, validated_only=validated_only, **self._kwargs)
 
     def add_new_from_var_index(self, organism: str | None = None, **kwargs):
         """Update variable records.
@@ -704,7 +712,7 @@ class MuDataCurator:
         """Verify the modality exists."""
         for modality in modalities:
             if modality not in self._mdata.mod.keys():
-                raise …
+                raise ValidationError(f"modality '{modality}' does not exist!")
 
     def _save_from_var_index_modality(
         self, modality: str, validated_only: bool = True, **kwargs
@@ -729,7 +737,7 @@ class MuDataCurator:
         obs_fields: dict[str, dict[str, FieldAttr]] = {}
         for k, v in categoricals.items():
             if k not in self._mdata.obs.columns:
-                raise …
+                raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
             if any(k.startswith(prefix) for prefix in prefixes):
                 modality, col = k.split(":")[0], k.split(":")[1]
                 if modality not in obs_fields.keys():
@@ -1120,7 +1128,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
     import bionty as bt
 
     if organism is None and bt.settings.organism is None:
-        raise …
+        raise ValidationError(
             f"{registry.__name__} registry requires an organism!\n"
             "  → please pass an organism name via organism="
         )
@@ -1148,8 +1156,8 @@ def validate_categories(
         using_key: A reference LaminDB instance.
         organism: The organism name.
         source: The source record.
-        exclude: Exclude specific values.
-        standardize: …
+        exclude: Exclude specific values from validation.
+        standardize: Whether to standardize the values.
         validated_hint_print: The hint to print for validated values.
     """
     from lamindb._from_values import _print_values
@@ -1210,12 +1218,15 @@ def validate_categories(
 
     validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
     n_validated = len(values_validated)
+
     if n_validated > 0:
         _log_mapping_info()
+        terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}"
+        val_numerous = "" if n_validated == 1 else "s"
         logger.warning(
-            f"found {colors.yellow(n_validated)} validated …
-            f"{colors.yellow(…
-            f"{colors.yellow(validated_hint_print)}"
+            f"found {colors.yellow(n_validated)} validated term{val_numerous}: "
+            f"{colors.yellow(terms_str)}\n"
+            f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}"
         )
 
     non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_")
@@ -1224,19 +1235,21 @@ def validate_categories(
     if n_non_validated == 0:
         if n_validated == 0:
             logger.indent = ""
-            logger.success(f"{key} is validated against {colors.italic(model_field)}")
+            logger.success(f"'{key}' is validated against {colors.italic(model_field)}")
             return True, []
         else:
             # validated values still need to be saved to the current instance
             return False, []
     else:
-…
+        non_val_numerous = ("", "is") if n_non_validated == 1 else ("s", "are")
         print_values = _print_values(non_validated)
         warning_message = (
-            f"{colors.red(f'{n_non_validated} …
-            f"{colors.red(print_values)…
+            f"{colors.red(f'{n_non_validated} term{non_val_numerous[0]}')} {non_val_numerous[1]} not validated: "
+            f"{colors.red(', '.join(print_values.split(', ')[:10]) + ', ...' if len(print_values.split(', ')) > 10 else print_values)}\n"
+            f"→ fix typo{non_val_numerous[0]}, remove non-existent value{non_val_numerous[0]}, or save term{non_val_numerous[0]} via "
             f"{colors.red(non_validated_hint_print)}"
         )
+
         if logger.indent == "":
             _log_mapping_info()
         logger.warning(warning_message)
@@ -1427,6 +1440,19 @@ def save_artifact(
     return artifact
 
 
+def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
+    """Flatten a Pandas series containing lists or single items into a unique list of elements."""
+    result = set()
+
+    for item in series:
+        if isinstance(item, list):
+            result.update(item)
+        else:
+            result.add(item)
+
+    return list(result)
+
+
 def update_registry(
     values: list[str],
     field: FieldAttr,
@@ -1485,9 +1511,14 @@ def update_registry(
 
     public_records = [r for r in existing_and_public_records if r._state.adding]
     # here we check to only save the public records if they are from the specified source
-    # we check the uid because r.source and …
+    # we check the uid because r.source and source can be from different instances
     if source:
         public_records = [r for r in public_records if r.source.uid == source.uid]
+
+    if public_records:
+        settings.verbosity = "info"
+        logger.info(f"saving validated records of '{key}'")
+        settings.verbosity = "error"
     ln_save(public_records)
     labels_saved["from public"] = [
         getattr(r, field.field.name) for r in public_records
@@ -1596,24 +1627,25 @@ def log_saved_labels(
             continue
 
         if k == "without reference" and validated_only:
-            msg = colors.yellow(
-                f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
-            )
-            lookup_print = (
-                f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
-            )
-
-            hint = f".add_new_from('{key}')"
-            msg += f"\n → to lookup values, use {lookup_print}"
-            msg += (
-                f"\n → to save, run {colors.yellow(hint)}"
-                if save_function == "add_new_from"
-                else f"\n → to save, run {colors.yellow(save_function)}"
-            )
-            if warning:
-                logger.warning(msg)
-            else:
-                logger.info(msg)
+            continue
+            # msg = colors.yellow(
+            #     f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
+            # )
+            # lookup_print = (
+            #     f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
+            # )
+
+            # hint = f".add_new_from('{key}')"
+            # msg += f"\n → to lookup values, use {lookup_print}"
+            # msg += (
+            #     f"\n → to save, run {colors.yellow(hint)}"
+            #     if save_function == "add_new_from"
+            #     else f"\n → to save, run {colors.yellow(save_function)}"
+            # )
+            # if warning:
+            #     logger.warning(msg)
+            # else:
+            #     logger.info(msg)
         else:
             k = "" if k == "without reference" else f"{colors.green(k)} "
             # the term "transferred" stresses that this is always in the context of transferring
@@ -1631,8 +1663,8 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
     all_records = registry.from_values(list(values), field=field)
     is_feature = registry.filter(name=f"is_{key}").one_or_none()
     if is_feature is None:
-        is_feature = registry(name=f"is_{key}")
-        …
+        is_feature = registry(name=f"is_{key}").save()
+        logger.important(f"Created a parent ULabel: {is_feature}")
     is_feature.children.add(*all_records)
 
 
@@ -1689,7 +1721,7 @@ def _save_organism(name: str):  # pragma: no cover
     if organism is None:
         organism = bt.Organism.from_source(name=name)
         if organism is None:
-            raise …
+            raise ValidationError(
                 f"Organism '{name}' not found\n"
                 f" → please save it: bt.Organism(name='{name}').save()"
             )
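Note on the new _flatten_unique helper above: add_new_from now routes DataFrame columns through it, so a column whose cells are lists of labels is deduplicated into a flat list before reaching the registry. A quick standalone check of its semantics (the result order is not guaranteed because a set is used internally; example data is illustrative):

    import pandas as pd

    # mirror of the helper added in this diff
    def _flatten_unique(series):
        result = set()
        for item in series:
            if isinstance(item, list):
                result.update(item)  # list cell: add its elements
            else:
                result.add(item)  # scalar cell: add it directly
        return list(result)

    s = pd.Series([["T cell", "B cell"], "B cell"])
    assert set(_flatten_unique(s)) == {"T cell", "B cell"}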
lamindb/_feature.py
CHANGED
@@ -8,10 +8,9 @@ from lamindb_setup.core._docs import doc_args
 from lnschema_core.models import Artifact, Feature
 from pandas.api.types import CategoricalDtype, is_string_dtype
 
-from lamindb._utils import attach_func_to_class_method
-from lamindb.core._settings import settings
-
 from ._query_set import RecordsList
+from ._utils import attach_func_to_class_method
+from .core._settings import settings
 from .core.schema import dict_schema_name_to_model_name
 
 if TYPE_CHECKING:
lamindb/_feature_set.py
CHANGED
@@ -10,10 +10,9 @@ from lamindb_setup.core.hashing import hash_set
 from lnschema_core import Feature, FeatureSet, Record, ids
 from lnschema_core.types import FieldAttr, ListLike
 
-from lamindb._utils import attach_func_to_class_method
-
 from ._feature import convert_numpy_dtype_to_lamin_feature_type
 from ._record import init_self_from_db
+from ._utils import attach_func_to_class_method
 from .core.exceptions import ValidationError
 from .core.schema import (
     dict_related_model_to_related_name,