lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -3
- lamindb/_finish.py +32 -16
- lamindb/base/types.py +6 -4
- lamindb/core/_context.py +127 -57
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_settings.py +44 -4
- lamindb/core/_track_environment.py +5 -2
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +1 -1
- lamindb/core/storage/_tiledbsoma.py +14 -8
- lamindb/core/storage/_valid_suffixes.py +0 -1
- lamindb/core/storage/_zarr.py +1 -1
- lamindb/core/storage/objects.py +13 -8
- lamindb/core/storage/paths.py +9 -6
- lamindb/core/types.py +1 -1
- lamindb/curators/_legacy.py +2 -1
- lamindb/curators/core.py +106 -105
- lamindb/errors.py +9 -0
- lamindb/examples/fixtures/__init__.py +0 -0
- lamindb/examples/fixtures/sheets.py +224 -0
- lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
- lamindb/migrations/0105_record_unique_name.py +20 -0
- lamindb/migrations/0106_transfer_data_migration.py +25 -0
- lamindb/migrations/0107_add_schema_to_record.py +68 -0
- lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
- lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
- lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
- lamindb/migrations/0111_remove_record__sort_order.py +148 -0
- lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
- lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
- lamindb/migrations/0114_alter_run__status_code.py +24 -0
- lamindb/migrations/0115_alter_space_uid.py +52 -0
- lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
- lamindb/models/__init__.py +4 -3
- lamindb/models/_describe.py +88 -31
- lamindb/models/_feature_manager.py +627 -658
- lamindb/models/_label_manager.py +1 -3
- lamindb/models/artifact.py +214 -99
- lamindb/models/collection.py +7 -1
- lamindb/models/feature.py +288 -60
- lamindb/models/has_parents.py +3 -3
- lamindb/models/project.py +32 -15
- lamindb/models/query_manager.py +7 -1
- lamindb/models/query_set.py +118 -41
- lamindb/models/record.py +140 -94
- lamindb/models/run.py +42 -42
- lamindb/models/save.py +102 -16
- lamindb/models/schema.py +41 -8
- lamindb/models/sqlrecord.py +105 -40
- lamindb/models/storage.py +278 -0
- lamindb/models/transform.py +10 -2
- lamindb/models/ulabel.py +9 -1
- lamindb/py.typed +0 -0
- lamindb/setup/__init__.py +2 -1
- lamindb/setup/_switch.py +16 -0
- lamindb/setup/errors/__init__.py +4 -0
- lamindb/setup/types/__init__.py +4 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
- lamindb/models/core.py +0 -135
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
lamindb/core/storage/_tiledbsoma.py CHANGED
@@ -13,7 +13,7 @@ from lamindb_setup.core.upath import LocalPathClasses, create_path
 from packaging import version
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
     from tiledbsoma import Measurement as SOMAMeasurement
@@ -54,12 +54,18 @@ def _tiledb_config_s3(storepath: UPath) -> dict:
     else:
         tiledb_config["vfs.s3.region"] = get_storage_region(storepath)
 
-    if "key" in storage_options:
-        tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
-    if "secret" in storage_options:
-        tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
-    if "token" in storage_options:
-        tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
+    if storage_options.get("anon", False):
+        tiledb_config["vfs.s3.no_sign_request"] = "true"
+        tiledb_config["vfs.s3.aws_access_key_id"] = ""
+        tiledb_config["vfs.s3.aws_secret_access_key"] = ""
+        tiledb_config["vfs.s3.aws_session_token"] = ""
+    else:
+        if "key" in storage_options:
+            tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
+        if "secret" in storage_options:
+            tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
+        if "token" in storage_options:
+            tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
 
     return tiledb_config
 
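Note: the new branch makes anonymous access to public buckets explicit: requests go unsigned, and credentials that TileDB might otherwise pick up from the environment are blanked out. A minimal sketch of what the branch produces, assuming fsspec-style storage_options as used by the surrounding function (not a public lamindb API):

    storage_options = {"anon": True}

    tiledb_config: dict = {}
    if storage_options.get("anon", False):
        # unsigned requests: skip request signing and clear any ambient credentials
        tiledb_config["vfs.s3.no_sign_request"] = "true"
        tiledb_config["vfs.s3.aws_access_key_id"] = ""
        tiledb_config["vfs.s3.aws_secret_access_key"] = ""
        tiledb_config["vfs.s3.aws_session_token"] = ""

    print(tiledb_config["vfs.s3.no_sign_request"])  # "true"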
@@ -148,7 +154,7 @@ def save_tiledbsoma_experiment(
     else:
         uid, _ = create_uid(n_full_id=20)
         storage_key = auto_storage_key_from_artifact_uid(
-            uid, ".tiledbsoma", True
+            uid, ".tiledbsoma", overwrite_versions=True
         )
         storepath = setup_settings.storage.root / storage_key
 
lamindb/core/storage/_zarr.py CHANGED
lamindb/core/storage/objects.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import PurePosixPath
-from typing import TYPE_CHECKING, TypeAlias
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 from anndata import AnnData
 from pandas import DataFrame
@@ -12,14 +12,15 @@ from lamindb.core._compat import (
 from lamindb.core.types import ScverseDataStructures
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
 
 SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures
 
 
-def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
+def infer_suffix(dmem: SupportedDataTypes, format: str | dict[str, Any] | None = None):
     """Infer LaminDB storage file suffix from a data object."""
     if isinstance(dmem, AnnData):
+        assert not isinstance(format, dict)  # noqa: S101
         if format is not None:
             # should be `.h5ad`, `.zarr`, or `.anndata.zarr`
             if format not in {"h5ad", "zarr", "anndata.zarr"}:
@@ -32,8 +33,12 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
         return ".h5ad"
 
     if isinstance(dmem, DataFrame):
-        if format == ".csv":
-            return ".csv"
+        if isinstance(format, str):
+            if format == ".csv":
+                return ".csv"
+        elif isinstance(format, dict):
+            if format.get("suffix") == ".csv":
+                return ".csv"
         return ".parquet"
 
     if with_package_obj(
@@ -68,7 +73,7 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
     raise NotImplementedError
 
 
-def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
+def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr, **kwargs) -> None:
     """Writes the passed in memory data to disk to a specified path."""
     if isinstance(dmem, AnnData):
         suffix = PurePosixPath(filepath).suffix
@@ -83,9 +88,9 @@ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
 
     if isinstance(dmem, DataFrame):
         if filepath.suffix == ".csv":
-            dmem.to_csv(filepath)
+            dmem.to_csv(filepath, **kwargs)
             return
-        dmem.to_parquet(filepath)
+        dmem.to_parquet(filepath, **kwargs)
         return
 
     if with_package_obj(dmem, "MuData", "mudata", lambda obj: obj.write(filepath))[0]:
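Note: together, the infer_suffix and write_to_disk hunks let DataFrame writer options travel from the caller down to pandas: format may now be a dict whose "suffix" key selects the file type, and write_to_disk forwards **kwargs to to_csv/to_parquet. A condensed sketch (infer_df_suffix is a made-up name; only the suffix and kwargs handling shown in the diff is assumed):

    import pandas as pd

    def infer_df_suffix(format=None) -> str:
        # condensed version of the DataFrame branch of infer_suffix above
        if isinstance(format, str) and format == ".csv":
            return ".csv"
        if isinstance(format, dict) and format.get("suffix") == ".csv":
            return ".csv"
        return ".parquet"

    df = pd.DataFrame({"a": [1, 2]})
    suffix = infer_df_suffix({"suffix": ".csv"})  # ".csv"
    # write_to_disk(df, path, **kwargs) now amounts to the following for CSV:
    df.to_csv(f"data{suffix}", index=False)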
lamindb/core/storage/paths.py CHANGED
@@ -15,7 +15,7 @@ from lamindb.core._settings import settings
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
 
     from lamindb.models.artifact import Artifact
 
@@ -26,15 +26,18 @@ AUTO_KEY_PREFIX = ".lamindb/"
 # add type annotations back asap when re-organizing the module
 def auto_storage_key_from_artifact(artifact: Artifact):
     if artifact.key is None or artifact._key_is_virtual:
-        is_dir = artifact.n_files is not None
-        return auto_storage_key_from_artifact_uid(artifact.uid, artifact.suffix, is_dir)
+        return auto_storage_key_from_artifact_uid(
+            artifact.uid, artifact.suffix, artifact.overwrite_versions
+        )
     else:
         return artifact.key
 
 
-def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> str:
+def auto_storage_key_from_artifact_uid(
+    uid: str, suffix: str, overwrite_versions: bool
+) -> str:
     assert isinstance(suffix, str)  # noqa: S101 Suffix cannot be None.
-    if is_dir:
+    if overwrite_versions:
         uid_storage = uid[:16]  # 16 chars, leave 4 chars for versioning
     else:
         uid_storage = uid
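Note: renaming the flag from a directory check to overwrite_versions makes the key scheme explicit: when versions overwrite each other, every version of an artifact maps to a single storage key built from the first 16 uid characters, so saving a new version replaces the stored object instead of adding a sibling. A sketch, assuming keys are assembled under AUTO_KEY_PREFIX as the hunk context suggests (the uid is hypothetical):

    AUTO_KEY_PREFIX = ".lamindb/"

    def storage_key(uid: str, suffix: str, overwrite_versions: bool) -> str:
        # mirrors auto_storage_key_from_artifact_uid above, for illustration only
        uid_storage = uid[:16] if overwrite_versions else uid  # drop the 4 version chars
        return f"{AUTO_KEY_PREFIX}{uid_storage}{suffix}"

    uid = "aB3dE5fG7hI9kL1M0001"  # hypothetical 20-char uid; the last 4 chars version it
    print(storage_key(uid, ".parquet", overwrite_versions=False))
    # .lamindb/aB3dE5fG7hI9kL1M0001.parquet  (each version is its own object)
    print(storage_key(uid, ".tiledbsoma", overwrite_versions=True))
    # .lamindb/aB3dE5fG7hI9kL1M.tiledbsoma   (all versions share one object)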
@@ -75,7 +78,7 @@ def attempt_accessing_path(
 
     if (
         artifact._state.db in ("default", None)
-        and artifact.storage_id == settings._storage_settings.id
+        and artifact.storage_id == settings._storage_settings._id
     ):
         if access_token is None:
             storage_settings = settings._storage_settings
lamindb/core/types.py CHANGED
lamindb/curators/_legacy.py CHANGED
@@ -16,7 +16,7 @@ from lamindb.models.artifact import data_is_scversedatastructure
 from ..errors import InvalidArgument
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
     from mudata import MuData
     from spatialdata import SpatialData
 
@@ -222,6 +222,7 @@ class DataFrameCatManager(CatManager):
             key="columns",
             source=self._sources.get("columns"),
         )
+        self._cat_vectors["columns"].add_new()
         for key, field in self._categoricals.items():
             self._cat_vectors[key] = CatVector(
                 values_getter=lambda k=key: self._dataset[
lamindb/curators/core.py CHANGED
@@ -21,7 +21,7 @@ from typing import TYPE_CHECKING, Any, Callable
 import lamindb_setup as ln_setup
 import numpy as np
 import pandas as pd
-import pandera.pandas as pa
+import pandera.pandas as pandera
 from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
 
@@ -38,7 +38,12 @@ from lamindb.models.artifact import (
     data_is_scversedatastructure,
     data_is_soma_experiment,
 )
-from lamindb.models.feature import parse_cat_dtype, parse_dtype
+from lamindb.models.feature import (
+    parse_cat_dtype,
+    parse_dtype,
+    parse_filter_string,
+    resolve_relation_filters,
+)
 
 from ..errors import InvalidArgument, ValidationError
 
@@ -276,7 +281,6 @@ class SlotsCurator(Curator):
     Args:
         dataset: The dataset to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
-
     """
 
     def __init__(
@@ -324,23 +328,25 @@ class SlotsCurator(Curator):
         if self._artifact is None:
             type_mapping = [
                 (
-                    lambda
+                    lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
                     Artifact.from_anndata,
                 ),
                 (
-                    lambda
+                    lambda dataset: data_is_scversedatastructure(dataset, "MuData"),
                     Artifact.from_mudata,
                 ),
                 (
-                    lambda
+                    lambda dataset: data_is_scversedatastructure(
+                        dataset, "SpatialData"
+                    ),
                     Artifact.from_spatialdata,
                 ),
                 (data_is_soma_experiment, Artifact.from_tiledbsoma),
             ]
 
-            for type_check,
+            for type_check, af_constructor in type_mapping:
                 if type_check(self._dataset):
-                    self._artifact =
+                    self._artifact = af_constructor(  # type: ignore
                         self._dataset,
                         key=key,
                         description=description,
@@ -373,9 +379,8 @@ def is_list_of_type(value, expected_type):
 def check_dtype(expected_type) -> Callable:
     """Creates a check function for Pandera that validates a column's dtype.
 
-    Supports both standard dtype checking and mixed list/single values for
-    the same type. For example, a column with expected_type 'float' would
-    also accept a mix of float values and lists of floats.
+    Supports both standard dtype checking and mixed list/single values for the same type.
+    For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
 
     Args:
         expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
@@ -394,6 +399,8 @@ def check_dtype(expected_type) -> Callable:
             return True
         elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
             return True
+        elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
+            return True
 
         # if we're here, it might be a mixed column with object dtype
         # need to check each value individually
@@ -406,8 +413,10 @@ def check_dtype(expected_type) -> Callable:
         elif expected_type_member == "num":
             # for numeric, accept either int or float
             return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
-        elif expected_type_member == "str" or expected_type_member.startswith(
-            "cat["
+        elif (
+            expected_type_member == "str"
+            or expected_type_member == "path"
+            or expected_type_member.startswith("cat[")
         ):
             return series.apply(lambda x: is_list_of_type(x, str)).all()
 
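Note: the mixed-value path matters because a column holding both scalars and lists has object dtype, so the pandas dtype checks above say nothing about it. A standalone sketch of the rule, with a simplified stand-in for is_list_of_type:

    import pandas as pd

    def is_list_of_type(value, expected_type) -> bool:
        # simplified stand-in for the helper referenced in the diff
        if isinstance(value, list):
            return all(isinstance(v, expected_type) for v in value)
        return isinstance(value, expected_type)

    # floats mixed with lists of floats: dtype is object, yet 'float' should pass
    series = pd.Series([1.0, [2.0, 3.0], 4.5])
    print(series.dtype)  # object
    print(series.apply(lambda x: is_list_of_type(x, float)).all())  # True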
@@ -490,9 +499,12 @@ class DataFrameCurator(Curator):
         else:
             required = False
         # series.dtype is "object" if the column has lists types, e.g. [["a", "b"], ["a"], ["b"]]
-        if feature.dtype in {"int", "float", "num"} or feature.dtype.startswith(
-            "list"
-        ):
+        if feature.dtype in {
+            "int",
+            "float",
+            "num",
+            "path",
+        } or feature.dtype.startswith("list"):
             if isinstance(self._dataset, pd.DataFrame):
                 dtype = (
                     self._dataset[feature.name].dtype
@@ -501,9 +513,9 @@ class DataFrameCurator(Curator):
                 )
             else:
                 dtype = None
-            pandera_columns[feature.name] = pa.Column(
+            pandera_columns[feature.name] = pandera.Column(
                 dtype=None,
-                checks=pa.Check(
+                checks=pandera.Check(
                     check_dtype(feature.dtype),
                     element_wise=False,
                     error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
@@ -518,7 +530,7 @@ class DataFrameCurator(Curator):
                 if not feature.dtype.startswith("cat")
                 else "category"
             )
-            pandera_columns[feature.name] = pa.Column(
+            pandera_columns[feature.name] = pandera.Column(
                 pandera_dtype,
                 nullable=feature.nullable,
                 coerce=feature.coerce_dtype,
@@ -533,24 +545,26 @@ class DataFrameCurator(Curator):
         if schema._index_feature_uid is not None:
             # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
             # so, we're typing it as `str` here
-            index = pa.Index(
+            index = pandera.Index(
                 schema.index.dtype
                 if not schema.index.dtype.startswith("cat")
                 else str
             )
         else:
             index = None
-        self._pandera_schema = pa.DataFrameSchema(
+        self._pandera_schema = pandera.DataFrameSchema(
             pandera_columns,
             coerce=schema.coerce_dtype,
             strict=schema.maximal_set,
             ordered=schema.ordered_set,
             index=index,
         )
+        # in the DataFrameCatManager, we use the
+        # actual columns of the dataset, not the pandera columns
+        # the pandera columns might have additional optional columns
         self._cat_manager = DataFrameCatManager(
             self._dataset,
             columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
-            columns_names=pandera_columns.keys(),
             categoricals=categoricals,
             index=schema.index,
             slot=slot,
@@ -621,10 +635,10 @@ class DataFrameCurator(Curator):
         if self._schema.n > 0:
             try:
                 # first validate through pandera
-                self._pandera_schema.validate(self._dataset)
+                self._pandera_schema.validate(self._dataset, lazy=True)
                 # then validate lamindb categoricals
                 self._cat_manager_validate()
-            except pa.errors.SchemaError as err:
+            except (pandera.errors.SchemaError, pandera.errors.SchemaErrors) as err:
                 self._is_validated = False
                 # .exconly() doesn't exist on SchemaError
                 raise ValidationError(str(err)) from err
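Note: with lazy=True, pandera collects all failures and raises a single SchemaErrors instead of stopping at the first SchemaError, hence the widened except clause. A standalone pandera sketch:

    import pandas as pd
    import pandera.pandas as pandera

    schema = pandera.DataFrameSchema(
        {
            "a": pandera.Column(int),
            "b": pandera.Column(str),
        }
    )

    df = pd.DataFrame({"a": ["x"], "b": [1]})  # both columns violate the schema
    try:
        schema.validate(df, lazy=True)
    except pandera.errors.SchemaErrors as err:
        print(err.failure_cases)  # every violation, not just the first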
@@ -904,7 +918,7 @@ class SpatialDataCurator(SlotsCurator):
 
 
 class TiledbsomaExperimentCurator(SlotsCurator):
-    """Curator for `
+    """Curator for `tiledbsoma.Experiment`.
 
     Args:
         dataset: The `tiledbsoma.Experiment` object.
@@ -933,7 +947,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
 
         for slot, slot_schema in schema.slots.items():
             if slot.startswith("ms:"):
-
+                _, modality_slot = slot.split(":")
                 schema_dataset = (
                     self._dataset.ms[modality_slot.removesuffix(".T")]
                     .var.read()
@@ -943,21 +957,12 @@ class TiledbsomaExperimentCurator(SlotsCurator):
                 )
 
                 self._slots[slot] = DataFrameCurator(
-                    (
-                        schema_dataset.T
-                        if modality_slot == "var.T"
-                        or (
-                            # backward compat
-                            modality_slot == "var"
-                            and schema.slots[slot].itype not in {None, "Feature"}
-                        )
-                        else schema_dataset
-                    ),
+                    (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
                     slot_schema,
                 )
             else:
                 # global Experiment obs slot
-
+                modality_slot = slot
                 schema_dataset = (
                     self._dataset.obs.read()
                     .concat()
@@ -969,16 +974,8 @@ class TiledbsomaExperimentCurator(SlotsCurator):
                     slot_schema,
                 )
 
-            if modality_slot == "var" and schema.slots[slot].itype not in {
-                None,
-                "Feature",
-            }:
-                logger.warning(
-                    "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
-                )
-
             _assign_var_fields_categoricals_multimodal(
-                modality=slot,  # not
+                modality=slot,  # not passing `measurement` here because it's a constant. The slot has the actual modality
                 slot_type=modality_slot,
                 slot=slot,
                 slot_schema=slot_schema,
@@ -1020,6 +1017,13 @@ class CatVector:
         self.feature = feature
         self.records = None
         self._maximal_set = maximal_set
+
+        self._all_filters = {"source": self._source, "organism": self._organism}
+        if self._subtype_str and "=" in self._subtype_str:
+            self._all_filters.update(
+                resolve_relation_filters(parse_filter_string(self._subtype_str), self)  # type: ignore
+            )
+
         if hasattr(field.field.model, "_name_field"):
             label_ref_is_name = field.field.name == field.field.model._name_field
         else:
@@ -1049,7 +1053,7 @@ class CatVector:
         # should probably add a setting `at_least_one_validated`
         result = True
         if len(self.values) > 0 and len(self.values) == len(self._non_validated):
-
+            logger.warning(f"no values were validated for {self._key}!")
         # len(self._non_validated) != 0
         # if maximal_set is True, return False
         # if maximal_set is False, return True
@@ -1116,9 +1120,15 @@ class CatVector:
         registry = self._field.field.model
         field_name = self._field.field.name
         model_field = registry.__get_name_with_module__()
-        filter_kwargs = get_current_filter_kwargs(
-            registry, {"organism": self._organism, "source": self._source}
-        )
+        filter_kwargs = get_current_filter_kwargs(registry, self._all_filters)
+
+        valid_from_values_kwargs = {}
+        for key, value in filter_kwargs.items():
+            if key in {"field", "organism", "source", "mute"}:
+                valid_from_values_kwargs[key] = value
+            elif hasattr(registry, key) and "__" not in key:
+                valid_from_values_kwargs[key] = value
+
         values = [
             i
             for i in self.values
@@ -1133,13 +1143,13 @@ class CatVector:
         str_values = _flatten_unique(values)
 
         # inspect the default instance and save validated records from public
-        if (
-            self._subtype_str != "" and "__" not in self._subtype_str
-        ):  # not for general filter expressions
+        if self._subtype_str != "" and "=" not in self._subtype_str:
             related_name = registry._meta.get_field("type").remote_field.related_name
-            type_record = registry.get(name=self._subtype_str)
-            self._subtype_query_set = getattr(type_record, related_name).all()
-
+            type_record = registry.get(name=self._subtype_str)
+            if registry.__name__ == "Record":
+                self._subtype_query_set = type_record.query_children()
+            else:
+                self._subtype_query_set = getattr(type_record, related_name).all()
         values_array = np.array(str_values)
         validated_mask = self._subtype_query_set.validate(  # type: ignore
             values_array, field=self._field, **filter_kwargs, mute=True
@@ -1149,11 +1159,14 @@ class CatVector:
                 values_array[~validated_mask],
             )
             records = registry.from_values(
-                validated_labels, field=self._field, **filter_kwargs, mute=True
+                validated_labels,
+                field=self._field,
+                **valid_from_values_kwargs,
+                mute=True,
             )
         else:
             existing_and_public_records = registry.from_values(
-                str_values, field=self._field, **filter_kwargs, mute=True
+                str_values, field=self._field, **valid_from_values_kwargs, mute=True
             )
             existing_and_public_labels = [
                 getattr(r, field_name) for r in existing_and_public_records
@@ -1236,16 +1249,25 @@ class CatVector:
         field_name = self._field.field.name
         model_field = f"{registry.__name__}.{field_name}"
 
-        kwargs_current = get_current_filter_kwargs(
-            registry, {"organism": self._organism, "source": self._source}
-        )
+        kwargs_current = get_current_filter_kwargs(registry, self._all_filters)
+
+        valid_inspect_kwargs = {}
+        for key, value in kwargs_current.items():
+            if key in {"field", "organism", "source", "mute", "from_source"}:
+                valid_inspect_kwargs[key] = value
+            elif hasattr(registry, key) and "__" not in key:
+                valid_inspect_kwargs[key] = value
 
         # inspect values from the default instance, excluding public
         registry_or_queryset = registry
         if self._subtype_query_set is not None:
             registry_or_queryset = self._subtype_query_set
         inspect_result = registry_or_queryset.inspect(
-            values, field=self._field, mute=True, from_source=False
+            values,
+            field=self._field,
+            mute=True,
+            from_source=False,
+            **valid_inspect_kwargs,
         )
         non_validated = inspect_result.non_validated
         syn_mapper = inspect_result.synonyms_mapper
@@ -1257,7 +1279,7 @@ class CatVector:
                 non_validated,
                 field=self._field,
                 mute=True,
-                **kwargs_current,
+                **valid_inspect_kwargs,
             )
             values_validated += [getattr(r, field_name) for r in public_records]
 
@@ -1309,10 +1331,6 @@ class CatVector:
         self._validated, self._non_validated = self._add_validated()
         self._non_validated, self._synonyms = self._validate(values=self._non_validated)
 
-        # always register new Features if they are columns
-        if self._key == "columns" and self._field == Feature.name:
-            self.add_new()
-
     def standardize(self) -> None:
         """Standardize the vector."""
         registry = self._field.field.model
@@ -1363,7 +1381,6 @@ class DataFrameCatManager:
         self,
         df: pd.DataFrame | Artifact,
         columns_field: FieldAttr = Feature.name,
-        columns_names: Iterable[str] | None = None,
         categoricals: list[Feature] | None = None,
         sources: dict[str, SQLRecord] | None = None,
         index: Feature | None = None,
@@ -1387,29 +1404,19 @@ class DataFrameCatManager:
         self._slot = slot
         self._maximal_set = maximal_set
 
-
-
-
-            self._cat_vectors["columns"] = CatVector(
-                values_getter=columns_names,
-                field=columns_field,
-                key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
-                source=self._sources.get("columns"),
-                cat_manager=self,
-                maximal_set=self._maximal_set,
-            )
-        else:
-            self._cat_vectors["columns"] = CatVector(
-                values_getter=lambda: self._dataset.columns,  # lambda ensures the inplace update
-                values_setter=lambda new_values: setattr(
-                    self._dataset, "columns", pd.Index(new_values)
-                ),
-                field=columns_field,
-                key="columns",
-                source=self._sources.get("columns"),
-                cat_manager=self,
-                maximal_set=self._maximal_set,
+        self._cat_vectors["columns"] = CatVector(
+            values_getter=lambda: self._dataset.keys(),  # lambda ensures the inplace update
+            values_setter=lambda new_values: setattr(
+                self._dataset, "columns", pd.Index(new_values)
             )
+            if isinstance(self._dataset, pd.DataFrame)
+            else None,
+            field=columns_field,
+            key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
+            source=self._sources.get("columns"),
+            cat_manager=self,
+            maximal_set=self._maximal_set,
+        )
         for feature in self._categoricals:
             result = parse_dtype(feature.dtype)[
                 0
@@ -1533,25 +1540,19 @@ class DataFrameCatManager:
             self._cat_vectors[key].add_new(**kwargs)
 
 
-def get_current_filter_kwargs(registry: type[SQLRecord], kwargs: dict) -> dict:
+def get_current_filter_kwargs(
+    registry: type[SQLRecord], kwargs: dict[str, SQLRecord]
+) -> dict:
     """Make sure the source and organism are saved in the same database as the registry."""
     db = registry.filter().db
-    source = kwargs.get("source")
-    organism = kwargs.get("organism")
     filter_kwargs = kwargs.copy()
 
-
-    if isinstance(organism, SQLRecord) and organism._state.db != "default":
-        if db is None or db == "default":
-            organism_default = copy.copy(organism)
-            organism_default.save()
-            filter_kwargs["organism"] = organism_default
-    if isinstance(source, SQLRecord) and source._state.db != "default":
-        if db is None or db == "default":
-            source_default = copy.copy(source)
-            # save the source record in the default database
-            source_default.save()
-            filter_kwargs["source"] = source_default
+    for key, value in kwargs.items():
+        if isinstance(value, SQLRecord) and value._state.db != "default":
+            if db is None or db == "default":
+                value_default = copy.copy(value)
+                value_default.save()
+                filter_kwargs[key] = value_default
 
     return filter_kwargs
 
lamindb/errors.py CHANGED
@@ -7,10 +7,13 @@
    InvalidArgument
    DoesNotExist
    NotebookNotSaved
+   UnknownStorageLocation
    MissingContextUID
    UpdateContext
    IntegrityError
+   FieldValidationError
    SQLRecordNameChangeIntegrityError
+   NoWriteAccess
 
 """
 
@@ -43,6 +46,12 @@ class NotebookNotSaved(Exception):
     pass
 
 
+class UnknownStorageLocation(Exception):
+    """Path is not contained in any known storage location."""
+
+    pass
+
+
 # equivalent to Django's DoesNotExist
 # and SQLAlchemy's NoResultFound
 class DoesNotExist(Exception):
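Note: how UnknownStorageLocation surfaces is not shown in this diff; a hedged usage sketch (the trigger, registering a path outside any registered storage location, is an assumption):

    import lamindb as ln
    from lamindb.errors import UnknownStorageLocation

    try:
        # assumption: a path outside all registered storage locations raises
        ln.Artifact("s3://some-unregistered-bucket/data.h5ad", description="example").save()
    except UnknownStorageLocation as e:
        print(f"register the storage location first: {e}")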