lamindb 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +25 -6
- lamindb/_finish.py +5 -5
- lamindb/_tracked.py +1 -1
- lamindb/_view.py +4 -4
- lamindb/core/_context.py +32 -6
- lamindb/core/_settings.py +1 -1
- lamindb/core/datasets/mini_immuno.py +8 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +9 -9
- lamindb/core/storage/_valid_suffixes.py +1 -0
- lamindb/core/storage/_zarr.py +32 -107
- lamindb/curators/__init__.py +19 -2
- lamindb/curators/_cellxgene_schemas/__init__.py +3 -3
- lamindb/curators/_legacy.py +15 -19
- lamindb/curators/core.py +247 -80
- lamindb/errors.py +2 -2
- lamindb/migrations/0069_squashed.py +8 -8
- lamindb/migrations/0071_lamindbv1_migrate_schema.py +3 -3
- lamindb/migrations/0073_merge_ourprojects.py +7 -7
- lamindb/migrations/0075_lamindbv1_part5.py +1 -1
- lamindb/migrations/0077_lamindbv1_part6b.py +3 -3
- lamindb/migrations/0080_polish_lamindbv1.py +2 -2
- lamindb/migrations/0088_schema_components.py +1 -1
- lamindb/migrations/0090_runproject_project_runs.py +2 -2
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +1 -1
- lamindb/migrations/0094_writeloglock_writelogmigrationstate_and_more.py +84 -0
- lamindb/migrations/0095_remove_rundata_flextable.py +155 -0
- lamindb/migrations/0096_remove_artifact__param_values_and_more.py +266 -0
- lamindb/migrations/0097_remove_schemaparam_param_remove_paramvalue_param_and_more.py +27 -0
- lamindb/migrations/0098_alter_feature_type_alter_project_type_and_more.py +656 -0
- lamindb/migrations/0099_alter_writelog_seqno.py +22 -0
- lamindb/migrations/0100_branch_alter_artifact__branch_code_and_more.py +102 -0
- lamindb/migrations/0101_alter_artifact_hash_alter_feature_name_and_more.py +444 -0
- lamindb/migrations/0102_remove_writelog_branch_code_and_more.py +72 -0
- lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +46 -0
- lamindb/migrations/{0090_squashed.py → 0103_squashed.py} +1013 -1009
- lamindb/models/__init__.py +35 -18
- lamindb/models/_describe.py +4 -4
- lamindb/models/_django.py +38 -4
- lamindb/models/_feature_manager.py +66 -123
- lamindb/models/_from_values.py +13 -13
- lamindb/models/_label_manager.py +8 -6
- lamindb/models/_relations.py +7 -7
- lamindb/models/artifact.py +166 -156
- lamindb/models/can_curate.py +25 -25
- lamindb/models/collection.py +48 -18
- lamindb/models/core.py +3 -3
- lamindb/models/feature.py +88 -60
- lamindb/models/has_parents.py +17 -17
- lamindb/models/project.py +52 -24
- lamindb/models/query_manager.py +5 -5
- lamindb/models/query_set.py +61 -37
- lamindb/models/record.py +158 -1583
- lamindb/models/run.py +39 -176
- lamindb/models/save.py +6 -6
- lamindb/models/schema.py +32 -43
- lamindb/models/sqlrecord.py +1743 -0
- lamindb/models/transform.py +17 -33
- lamindb/models/ulabel.py +21 -15
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/METADATA +7 -11
- lamindb-1.6.0.dist-info/RECORD +118 -0
- lamindb/core/storage/_anndata_sizes.py +0 -41
- lamindb/models/flextable.py +0 -163
- lamindb-1.5.3.dist-info/RECORD +0 -109
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/LICENSE +0 -0
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/WHEEL +0 -0
lamindb/curators/core.py
CHANGED
@@ -15,12 +15,13 @@ from __future__ import annotations
|
|
15
15
|
|
16
16
|
import copy
|
17
17
|
import re
|
18
|
+
from collections.abc import Iterable
|
18
19
|
from typing import TYPE_CHECKING, Any, Callable
|
19
20
|
|
20
21
|
import lamindb_setup as ln_setup
|
21
22
|
import numpy as np
|
22
23
|
import pandas as pd
|
23
|
-
import pandera
|
24
|
+
import pandera.pandas as pa
|
24
25
|
from lamin_utils import colors, logger
|
25
26
|
from lamindb_setup.core._docs import doc_args
|
26
27
|
|
@@ -28,29 +29,29 @@ from lamindb.base.types import FieldAttr # noqa
|
|
28
29
|
from lamindb.models import (
|
29
30
|
Artifact,
|
30
31
|
Feature,
|
31
|
-
Record,
|
32
32
|
Run,
|
33
33
|
Schema,
|
34
|
+
SQLRecord,
|
34
35
|
)
|
35
36
|
from lamindb.models._from_values import _format_values
|
36
37
|
from lamindb.models.artifact import (
|
37
|
-
|
38
|
-
|
39
|
-
data_is_spatialdata,
|
38
|
+
data_is_scversedatastructure,
|
39
|
+
data_is_soma_experiment,
|
40
40
|
)
|
41
41
|
from lamindb.models.feature import parse_cat_dtype, parse_dtype
|
42
42
|
|
43
43
|
from ..errors import InvalidArgument, ValidationError
|
44
44
|
|
45
45
|
if TYPE_CHECKING:
|
46
|
-
from collections.abc import Iterable
|
47
46
|
from typing import Any
|
48
47
|
|
49
48
|
from anndata import AnnData
|
50
49
|
from mudata import MuData
|
51
50
|
from spatialdata import SpatialData
|
51
|
+
from tiledbsoma._experiment import Experiment as SOMAExperiment
|
52
52
|
|
53
|
-
from lamindb.
|
53
|
+
from lamindb.core.types import ScverseDataStructures
|
54
|
+
from lamindb.models.query_set import SQLRecordList
|
54
55
|
|
55
56
|
|
56
57
|
def strip_ansi_codes(text):
|
@@ -79,7 +80,7 @@ class CatLookup:
|
|
79
80
|
categoricals: list[Feature] | dict[str, FieldAttr],
|
80
81
|
slots: dict[str, FieldAttr] = None,
|
81
82
|
public: bool = False,
|
82
|
-
sources: dict[str,
|
83
|
+
sources: dict[str, SQLRecord] | None = None,
|
83
84
|
) -> None:
|
84
85
|
slots = slots or {}
|
85
86
|
if isinstance(categoricals, list):
|
@@ -269,7 +270,6 @@ class Curator:
|
|
269
270
|
)
|
270
271
|
|
271
272
|
|
272
|
-
# default implementation for AnnDataCurator, MuDataCurator, and SpatialDataCurator
|
273
273
|
class SlotsCurator(Curator):
|
274
274
|
"""Curator for a dataset with slots.
|
275
275
|
|
@@ -281,13 +281,13 @@ class SlotsCurator(Curator):
|
|
281
281
|
|
282
282
|
def __init__(
|
283
283
|
self,
|
284
|
-
dataset:
|
284
|
+
dataset: Artifact | ScverseDataStructures | SOMAExperiment,
|
285
285
|
schema: Schema,
|
286
286
|
) -> None:
|
287
287
|
super().__init__(dataset=dataset, schema=schema)
|
288
288
|
self._slots: dict[str, DataFrameCurator] = {}
|
289
289
|
|
290
|
-
# used
|
290
|
+
# used for multimodal data structures (not AnnData)
|
291
291
|
# in form of {table/modality_key: var_field}
|
292
292
|
self._var_fields: dict[str, FieldAttr] = {}
|
293
293
|
# in form of {table/modality_key: categoricals}
|
@@ -320,31 +320,35 @@ class SlotsCurator(Curator):
|
|
320
320
|
"""{}""" # noqa: D415
|
321
321
|
if not self._is_validated:
|
322
322
|
self.validate()
|
323
|
+
|
323
324
|
if self._artifact is None:
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
self.
|
342
|
-
self.
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
325
|
+
type_mapping = [
|
326
|
+
(
|
327
|
+
lambda data: data_is_scversedatastructure(data, "AnnData"),
|
328
|
+
Artifact.from_anndata,
|
329
|
+
),
|
330
|
+
(
|
331
|
+
lambda data: data_is_scversedatastructure(data, "MuData"),
|
332
|
+
Artifact.from_mudata,
|
333
|
+
),
|
334
|
+
(
|
335
|
+
lambda data: data_is_scversedatastructure(data, "SpatialData"),
|
336
|
+
Artifact.from_spatialdata,
|
337
|
+
),
|
338
|
+
(data_is_soma_experiment, Artifact.from_tiledbsoma),
|
339
|
+
]
|
340
|
+
|
341
|
+
for type_check, factory in type_mapping:
|
342
|
+
if type_check(self._dataset):
|
343
|
+
self._artifact = factory( # type: ignore
|
344
|
+
self._dataset,
|
345
|
+
key=key,
|
346
|
+
description=description,
|
347
|
+
revises=revises,
|
348
|
+
run=run,
|
349
|
+
)
|
350
|
+
break
|
351
|
+
|
348
352
|
self._artifact.schema = self._schema
|
349
353
|
self._artifact.save()
|
350
354
|
cat_vectors = {}
|
@@ -358,24 +362,57 @@ class SlotsCurator(Curator):
|
|
358
362
|
)
|
359
363
|
|
360
364
|
|
365
|
+
def is_list_of_type(value, expected_type):
|
366
|
+
"""Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
|
367
|
+
if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
|
368
|
+
# handle nested lists recursively
|
369
|
+
return all(is_list_of_type(item, expected_type) for item in value)
|
370
|
+
return isinstance(value, expected_type)
|
371
|
+
|
372
|
+
|
361
373
|
def check_dtype(expected_type) -> Callable:
|
362
374
|
"""Creates a check function for Pandera that validates a column's dtype.
|
363
375
|
|
376
|
+
Supports both standard dtype checking and mixed list/single values for
|
377
|
+
the same type. For example, a column with expected_type 'list[float]' would
|
378
|
+
also accept a mix of float values and lists of floats.
|
379
|
+
|
364
380
|
Args:
|
365
|
-
expected_type: String identifier for the expected type ('int', 'float',
|
381
|
+
expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
|
366
382
|
|
367
383
|
Returns:
|
368
|
-
A function that checks if a series has the expected dtype
|
384
|
+
A function that checks if a series has the expected dtype or contains mixed types
|
369
385
|
"""
|
370
386
|
|
371
387
|
def check_function(series):
|
372
|
-
if
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
388
|
+
# first check if the series is entirely of the expected dtype (fast path)
|
389
|
+
if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
|
390
|
+
return True
|
391
|
+
elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
|
392
|
+
return True
|
393
|
+
elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
|
394
|
+
return True
|
395
|
+
elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
|
396
|
+
return True
|
397
|
+
|
398
|
+
# if we're here, it might be a mixed column with object dtype
|
399
|
+
# need to check each value individually
|
400
|
+
if series.dtype == "object" and expected_type.startswith("list"):
|
401
|
+
expected_type_member = expected_type.replace("list[", "").removesuffix("]")
|
402
|
+
if expected_type_member == "int":
|
403
|
+
return series.apply(lambda x: is_list_of_type(x, int)).all()
|
404
|
+
elif expected_type_member == "float":
|
405
|
+
return series.apply(lambda x: is_list_of_type(x, float)).all()
|
406
|
+
elif expected_type_member == "num":
|
407
|
+
# for numeric, accept either int or float
|
408
|
+
return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
|
409
|
+
elif expected_type_member == "str" or expected_type_member.startswith(
|
410
|
+
"cat["
|
411
|
+
):
|
412
|
+
return series.apply(lambda x: is_list_of_type(x, str)).all()
|
413
|
+
|
414
|
+
# if we get here, the validation failed
|
415
|
+
return False
|
379
416
|
|
380
417
|
return check_function
|
381
418
|
|
@@ -452,7 +489,10 @@ class DataFrameCurator(Curator):
|
|
452
489
|
required = feature.uid not in optional_feature_uids
|
453
490
|
else:
|
454
491
|
required = False
|
455
|
-
|
492
|
+
# series.dtype is "object" if the column has list types, e.g. [["a", "b"], ["a"], ["b"]]
|
493
|
+
if feature.dtype in {"int", "float", "num"} or feature.dtype.startswith(
|
494
|
+
"list"
|
495
|
+
):
|
456
496
|
if isinstance(self._dataset, pd.DataFrame):
|
457
497
|
dtype = (
|
458
498
|
self._dataset[feature.name].dtype
|
@@ -461,9 +501,9 @@ class DataFrameCurator(Curator):
|
|
461
501
|
)
|
462
502
|
else:
|
463
503
|
dtype = None
|
464
|
-
pandera_columns[feature.name] =
|
504
|
+
pandera_columns[feature.name] = pa.Column(
|
465
505
|
dtype=None,
|
466
|
-
checks=
|
506
|
+
checks=pa.Check(
|
467
507
|
check_dtype(feature.dtype),
|
468
508
|
element_wise=False,
|
469
509
|
error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
|
@@ -478,27 +518,29 @@ class DataFrameCurator(Curator):
|
|
478
518
|
if not feature.dtype.startswith("cat")
|
479
519
|
else "category"
|
480
520
|
)
|
481
|
-
pandera_columns[feature.name] =
|
521
|
+
pandera_columns[feature.name] = pa.Column(
|
482
522
|
pandera_dtype,
|
483
523
|
nullable=feature.nullable,
|
484
524
|
coerce=feature.coerce_dtype,
|
485
525
|
required=required,
|
486
526
|
)
|
487
|
-
if feature.dtype.startswith("cat")
|
527
|
+
if feature.dtype.startswith("cat") or feature.dtype.startswith(
|
528
|
+
"list[cat["
|
529
|
+
):
|
488
530
|
# validate categoricals if the column is required or if the column is present
|
489
531
|
if required or feature.name in self._dataset.keys():
|
490
532
|
categoricals.append(feature)
|
491
533
|
if schema._index_feature_uid is not None:
|
492
534
|
# in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
|
493
535
|
# so, we're typing it as `str` here
|
494
|
-
index =
|
536
|
+
index = pa.Index(
|
495
537
|
schema.index.dtype
|
496
538
|
if not schema.index.dtype.startswith("cat")
|
497
539
|
else str
|
498
540
|
)
|
499
541
|
else:
|
500
542
|
index = None
|
501
|
-
self._pandera_schema =
|
543
|
+
self._pandera_schema = pa.DataFrameSchema(
|
502
544
|
pandera_columns,
|
503
545
|
coerce=schema.coerce_dtype,
|
504
546
|
strict=schema.maximal_set,
|
@@ -582,7 +624,7 @@ class DataFrameCurator(Curator):
|
|
582
624
|
self._pandera_schema.validate(self._dataset)
|
583
625
|
# then validate lamindb categoricals
|
584
626
|
self._cat_manager_validate()
|
585
|
-
except
|
627
|
+
except pa.errors.SchemaError as err:
|
586
628
|
self._is_validated = False
|
587
629
|
# .exconly() doesn't exist on SchemaError
|
588
630
|
raise ValidationError(str(err)) from err
|
@@ -627,8 +669,12 @@ class AnnDataCurator(SlotsCurator):
|
|
627
669
|
|
628
670
|
Example:
|
629
671
|
|
630
|
-
|
672
|
+
.. literalinclude:: scripts/curate_anndata_flexible.py
|
673
|
+
:language: python
|
674
|
+
:caption: curate_anndata_flexible.py
|
631
675
|
|
676
|
+
See Also:
|
677
|
+
:meth:`~lamindb.Artifact.from_anndata`.
|
632
678
|
"""
|
633
679
|
|
634
680
|
def __init__(
|
@@ -637,7 +683,7 @@ class AnnDataCurator(SlotsCurator):
|
|
637
683
|
schema: Schema,
|
638
684
|
) -> None:
|
639
685
|
super().__init__(dataset=dataset, schema=schema)
|
640
|
-
if not
|
686
|
+
if not data_is_scversedatastructure(self._dataset, "AnnData"):
|
641
687
|
raise InvalidArgument("dataset must be AnnData-like.")
|
642
688
|
if schema.otype != "AnnData":
|
643
689
|
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
@@ -710,9 +756,12 @@ class MuDataCurator(SlotsCurator):
|
|
710
756
|
|
711
757
|
Example:
|
712
758
|
|
713
|
-
.. literalinclude:: scripts/
|
759
|
+
.. literalinclude:: scripts/curate_mudata.py
|
714
760
|
:language: python
|
715
|
-
:caption:
|
761
|
+
:caption: curate_mudata.py
|
762
|
+
|
763
|
+
See Also:
|
764
|
+
:meth:`~lamindb.Artifact.from_mudata`.
|
716
765
|
"""
|
717
766
|
|
718
767
|
def __init__(
|
@@ -721,7 +770,7 @@ class MuDataCurator(SlotsCurator):
|
|
721
770
|
schema: Schema,
|
722
771
|
) -> None:
|
723
772
|
super().__init__(dataset=dataset, schema=schema)
|
724
|
-
if not
|
773
|
+
if not data_is_scversedatastructure(self._dataset, "MuData"):
|
725
774
|
raise InvalidArgument("dataset must be MuData-like.")
|
726
775
|
if schema.otype != "MuData":
|
727
776
|
raise InvalidArgument("Schema otype must be 'MuData'.")
|
@@ -774,18 +823,21 @@ class SpatialDataCurator(SlotsCurator):
|
|
774
823
|
|
775
824
|
Example:
|
776
825
|
|
777
|
-
|
826
|
+
.. literalinclude:: scripts/curate_spatialdata.py
|
827
|
+
:language: python
|
828
|
+
:caption: curate_spatialdata.py
|
829
|
+
|
830
|
+
See Also:
|
831
|
+
:meth:`~lamindb.Artifact.from_spatialdata`.
|
778
832
|
"""
|
779
833
|
|
780
834
|
def __init__(
|
781
835
|
self,
|
782
836
|
dataset: SpatialData | Artifact,
|
783
837
|
schema: Schema,
|
784
|
-
*,
|
785
|
-
sample_metadata_key: str | None = "sample",
|
786
838
|
) -> None:
|
787
839
|
super().__init__(dataset=dataset, schema=schema)
|
788
|
-
if not
|
840
|
+
if not data_is_scversedatastructure(self._dataset, "SpatialData"):
|
789
841
|
raise InvalidArgument("dataset must be SpatialData-like.")
|
790
842
|
if schema.otype != "SpatialData":
|
791
843
|
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
@@ -851,6 +903,92 @@ class SpatialDataCurator(SlotsCurator):
|
|
851
903
|
self._columns_field = self._var_fields
|
852
904
|
|
853
905
|
|
906
|
+
class TiledbsomaExperimentCurator(SlotsCurator):
|
907
|
+
"""Curator for `TileDB-SOMA`.
|
908
|
+
|
909
|
+
Args:
|
910
|
+
dataset: The `tiledbsoma.Experiment` object.
|
911
|
+
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
912
|
+
|
913
|
+
Example:
|
914
|
+
|
915
|
+
.. literalinclude:: scripts/curate_soma_experiment.py
|
916
|
+
:language: python
|
917
|
+
:caption: curate_soma_experiment.py
|
918
|
+
|
919
|
+
See Also:
|
920
|
+
:meth:`~lamindb.Artifact.from_tiledbsoma`.
|
921
|
+
"""
|
922
|
+
|
923
|
+
def __init__(
|
924
|
+
self,
|
925
|
+
dataset: SOMAExperiment | Artifact,
|
926
|
+
schema: Schema,
|
927
|
+
) -> None:
|
928
|
+
super().__init__(dataset=dataset, schema=schema)
|
929
|
+
if not data_is_soma_experiment(self._dataset):
|
930
|
+
raise InvalidArgument("dataset must be SOMAExperiment-like.")
|
931
|
+
if schema.otype != "tiledbsoma":
|
932
|
+
raise InvalidArgument("Schema otype must be 'tiledbsoma'.")
|
933
|
+
|
934
|
+
for slot, slot_schema in schema.slots.items():
|
935
|
+
if slot.startswith("ms:"):
|
936
|
+
ms, modality_slot = slot.split(":")
|
937
|
+
schema_dataset = (
|
938
|
+
self._dataset.ms[modality_slot.removesuffix(".T")]
|
939
|
+
.var.read()
|
940
|
+
.concat()
|
941
|
+
.to_pandas()
|
942
|
+
.drop("soma_joinid", axis=1, errors="ignore")
|
943
|
+
)
|
944
|
+
|
945
|
+
self._slots[slot] = DataFrameCurator(
|
946
|
+
(
|
947
|
+
schema_dataset.T
|
948
|
+
if modality_slot == "var.T"
|
949
|
+
or (
|
950
|
+
# backward compat
|
951
|
+
modality_slot == "var"
|
952
|
+
and schema.slots[slot].itype not in {None, "Feature"}
|
953
|
+
)
|
954
|
+
else schema_dataset
|
955
|
+
),
|
956
|
+
slot_schema,
|
957
|
+
)
|
958
|
+
else:
|
959
|
+
# global Experiment obs slot
|
960
|
+
_ms, modality_slot = None, slot
|
961
|
+
schema_dataset = (
|
962
|
+
self._dataset.obs.read()
|
963
|
+
.concat()
|
964
|
+
.to_pandas()
|
965
|
+
.drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
|
966
|
+
)
|
967
|
+
self._slots[slot] = DataFrameCurator(
|
968
|
+
schema_dataset,
|
969
|
+
slot_schema,
|
970
|
+
)
|
971
|
+
|
972
|
+
if modality_slot == "var" and schema.slots[slot].itype not in {
|
973
|
+
None,
|
974
|
+
"Feature",
|
975
|
+
}:
|
976
|
+
logger.warning(
|
977
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
978
|
+
)
|
979
|
+
|
980
|
+
_assign_var_fields_categoricals_multimodal(
|
981
|
+
modality=slot, # not using "ms" here as it would always be the same for all modalities
|
982
|
+
slot_type=modality_slot,
|
983
|
+
slot=slot,
|
984
|
+
slot_schema=slot_schema,
|
985
|
+
var_fields=self._var_fields,
|
986
|
+
cat_vectors=self._cat_vectors,
|
987
|
+
slots=self._slots,
|
988
|
+
)
|
989
|
+
self._columns_field = self._var_fields
|
990
|
+
|
991
|
+
|
854
992
|
class CatVector:
|
855
993
|
"""Vector with categorical values."""
|
856
994
|
|
@@ -861,7 +999,7 @@ class CatVector:
|
|
861
999
|
field: FieldAttr, # The field to validate against.
|
862
1000
|
key: str, # The name of the vector to validate. Only used for logging.
|
863
1001
|
values_setter: Callable | None = None, # A callable that sets the values.
|
864
|
-
source:
|
1002
|
+
source: SQLRecord | None = None, # The ontology source to validate against.
|
865
1003
|
feature: Feature | None = None,
|
866
1004
|
cat_manager: DataFrameCatManager | None = None,
|
867
1005
|
subtype_str: str = "",
|
@@ -924,10 +1062,20 @@ class CatVector:
|
|
924
1062
|
|
925
1063
|
def _replace_synonyms(self) -> list[str]:
|
926
1064
|
"""Replace synonyms in the vector with standardized values."""
|
1065
|
+
|
1066
|
+
def process_value(value, syn_mapper):
|
1067
|
+
"""Helper function to process values recursively."""
|
1068
|
+
if isinstance(value, list):
|
1069
|
+
# Handle list - recursively process each item
|
1070
|
+
return [process_value(item, syn_mapper) for item in value]
|
1071
|
+
else:
|
1072
|
+
# Handle single value
|
1073
|
+
return syn_mapper.get(value, value)
|
1074
|
+
|
927
1075
|
syn_mapper = self._synonyms
|
928
1076
|
# replace the values in df
|
929
1077
|
std_values = self.values.map(
|
930
|
-
lambda unstd_val:
|
1078
|
+
lambda unstd_val: process_value(unstd_val, syn_mapper)
|
931
1079
|
)
|
932
1080
|
# remove the standardized values from self.non_validated
|
933
1081
|
non_validated = [i for i in self._non_validated if i not in syn_mapper]
|
@@ -971,15 +1119,28 @@ class CatVector:
|
|
971
1119
|
filter_kwargs = get_current_filter_kwargs(
|
972
1120
|
registry, {"organism": self._organism, "source": self._source}
|
973
1121
|
)
|
974
|
-
values = [
|
1122
|
+
values = [
|
1123
|
+
i
|
1124
|
+
for i in self.values
|
1125
|
+
if (isinstance(i, str) and i)
|
1126
|
+
or (isinstance(i, list) and i)
|
1127
|
+
or (isinstance(i, np.ndarray) and i.size > 0)
|
1128
|
+
]
|
975
1129
|
if not values:
|
976
1130
|
return [], []
|
1131
|
+
|
1132
|
+
# if a value is a list, we need to flatten it
|
1133
|
+
str_values = _flatten_unique(values)
|
1134
|
+
|
977
1135
|
# inspect the default instance and save validated records from public
|
978
1136
|
if (
|
979
1137
|
self._subtype_str != "" and "__" not in self._subtype_str
|
980
1138
|
): # not for general filter expressions
|
981
|
-
|
982
|
-
|
1139
|
+
related_name = registry._meta.get_field("type").remote_field.related_name
|
1140
|
+
self._subtype_query_set = getattr(
|
1141
|
+
registry.get(name=self._subtype_str), related_name
|
1142
|
+
).all()
|
1143
|
+
values_array = np.array(str_values)
|
983
1144
|
validated_mask = self._subtype_query_set.validate( # type: ignore
|
984
1145
|
values_array, field=self._field, **filter_kwargs, mute=True
|
985
1146
|
)
|
@@ -992,7 +1153,7 @@ class CatVector:
|
|
992
1153
|
)
|
993
1154
|
else:
|
994
1155
|
existing_and_public_records = registry.from_values(
|
995
|
-
|
1156
|
+
str_values, field=self._field, **filter_kwargs, mute=True
|
996
1157
|
)
|
997
1158
|
existing_and_public_labels = [
|
998
1159
|
getattr(r, field_name) for r in existing_and_public_records
|
@@ -1019,7 +1180,7 @@ class CatVector:
|
|
1019
1180
|
)
|
1020
1181
|
# non-validated records from the default instance
|
1021
1182
|
non_validated_labels = [
|
1022
|
-
i for i in
|
1183
|
+
i for i in str_values if i not in existing_and_public_labels
|
1023
1184
|
]
|
1024
1185
|
validated_labels = existing_and_public_labels
|
1025
1186
|
records = existing_and_public_records
|
@@ -1040,7 +1201,7 @@ class CatVector:
|
|
1040
1201
|
|
1041
1202
|
registry = self._field.field.model
|
1042
1203
|
field_name = self._field.field.name
|
1043
|
-
non_validated_records:
|
1204
|
+
non_validated_records: SQLRecordList[Any] = [] # type: ignore
|
1044
1205
|
if df is not None and registry == Feature:
|
1045
1206
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
1046
1207
|
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
@@ -1204,7 +1365,7 @@ class DataFrameCatManager:
|
|
1204
1365
|
columns_field: FieldAttr = Feature.name,
|
1205
1366
|
columns_names: Iterable[str] | None = None,
|
1206
1367
|
categoricals: list[Feature] | None = None,
|
1207
|
-
sources: dict[str,
|
1368
|
+
sources: dict[str, SQLRecord] | None = None,
|
1208
1369
|
index: Feature | None = None,
|
1209
1370
|
slot: str | None = None,
|
1210
1371
|
maximal_set: bool = False,
|
@@ -1372,20 +1533,20 @@ class DataFrameCatManager:
|
|
1372
1533
|
self._cat_vectors[key].add_new(**kwargs)
|
1373
1534
|
|
1374
1535
|
|
1375
|
-
def get_current_filter_kwargs(registry: type[
|
1536
|
+
def get_current_filter_kwargs(registry: type[SQLRecord], kwargs: dict) -> dict:
|
1376
1537
|
"""Make sure the source and organism are saved in the same database as the registry."""
|
1377
1538
|
db = registry.filter().db
|
1378
1539
|
source = kwargs.get("source")
|
1379
1540
|
organism = kwargs.get("organism")
|
1380
1541
|
filter_kwargs = kwargs.copy()
|
1381
1542
|
|
1382
|
-
if isinstance(organism,
|
1543
|
+
if isinstance(organism, SQLRecord) and organism._state.db != "default":
|
1383
1544
|
if db is None or db == "default":
|
1384
1545
|
organism_default = copy.copy(organism)
|
1385
1546
|
# save the organism record in the default database
|
1386
1547
|
organism_default.save()
|
1387
1548
|
filter_kwargs["organism"] = organism_default
|
1388
|
-
if isinstance(source,
|
1549
|
+
if isinstance(source, SQLRecord) and source._state.db != "default":
|
1389
1550
|
if db is None or db == "default":
|
1390
1551
|
source_default = copy.copy(source)
|
1391
1552
|
# save the source record in the default database
|
@@ -1505,18 +1666,24 @@ def annotate_artifact(
|
|
1505
1666
|
return artifact
|
1506
1667
|
|
1507
1668
|
|
1508
|
-
# TODO: need this function to support mutli-value columns
|
1509
1669
|
def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
|
1510
|
-
"""Flatten a Pandas series containing lists or single items into a unique list of elements.
|
1511
|
-
|
1670
|
+
"""Flatten a Pandas series containing lists or single items into a unique list of elements.
|
1671
|
+
|
1672
|
+
The order of elements in the result list preserves the order they first appear in the input series.
|
1673
|
+
"""
|
1674
|
+
# Use an insertion-ordered dict to preserve first-seen order while ensuring uniqueness
|
1675
|
+
result: dict = {}
|
1512
1676
|
|
1513
1677
|
for item in series:
|
1514
|
-
if isinstance(item, list):
|
1515
|
-
|
1678
|
+
if isinstance(item, list | np.ndarray):
|
1679
|
+
# Add each element to the dict (only first occurrence is kept)
|
1680
|
+
for element in item:
|
1681
|
+
result[element] = None
|
1516
1682
|
else:
|
1517
|
-
result
|
1683
|
+
result[item] = None
|
1518
1684
|
|
1519
|
-
|
1685
|
+
# Return the keys as a list, preserving order
|
1686
|
+
return list(result.keys())
|
1520
1687
|
|
1521
1688
|
|
1522
1689
|
def _save_organism(name: str):
|
lamindb/errors.py
CHANGED
@@ -10,7 +10,7 @@
|
|
10
10
|
MissingContextUID
|
11
11
|
UpdateContext
|
12
12
|
IntegrityError
|
13
|
-
|
13
|
+
SQLRecordNameChangeIntegrityError
|
14
14
|
|
15
15
|
"""
|
16
16
|
|
@@ -57,7 +57,7 @@ class InconsistentKey(Exception):
|
|
57
57
|
pass
|
58
58
|
|
59
59
|
|
60
|
-
class
|
60
|
+
class SQLRecordNameChangeIntegrityError(Exception):
|
61
61
|
"""Custom exception for name change errors."""
|
62
62
|
|
63
63
|
pass
|
@@ -569,7 +569,7 @@ class Migration(migrations.Migration):
|
|
569
569
|
),
|
570
570
|
),
|
571
571
|
],
|
572
|
-
bases=(lamindb.models.
|
572
|
+
bases=(lamindb.models.IsLink, models.Model),
|
573
573
|
),
|
574
574
|
migrations.AddField(
|
575
575
|
model_name="collection",
|
@@ -619,7 +619,7 @@ class Migration(migrations.Migration):
|
|
619
619
|
),
|
620
620
|
),
|
621
621
|
],
|
622
|
-
bases=(lamindb.models.
|
622
|
+
bases=(lamindb.models.IsLink, models.Model),
|
623
623
|
),
|
624
624
|
migrations.AddField(
|
625
625
|
model_name="artifact",
|
@@ -656,7 +656,7 @@ class Migration(migrations.Migration):
|
|
656
656
|
options={
|
657
657
|
"unique_together": {("featureset", "feature")},
|
658
658
|
},
|
659
|
-
bases=(models.Model, lamindb.models.
|
659
|
+
bases=(models.Model, lamindb.models.IsLink),
|
660
660
|
),
|
661
661
|
migrations.AddField(
|
662
662
|
model_name="feature",
|
@@ -727,7 +727,7 @@ class Migration(migrations.Migration):
|
|
727
727
|
),
|
728
728
|
),
|
729
729
|
],
|
730
|
-
bases=(lamindb.models.
|
730
|
+
bases=(lamindb.models.IsLink, models.Model),
|
731
731
|
),
|
732
732
|
migrations.AddField(
|
733
733
|
model_name="artifact",
|
@@ -805,7 +805,7 @@ class Migration(migrations.Migration):
|
|
805
805
|
options={
|
806
806
|
"unique_together": {("artifact", "paramvalue")},
|
807
807
|
},
|
808
|
-
bases=(models.Model, lamindb.models.
|
808
|
+
bases=(models.Model, lamindb.models.IsLink),
|
809
809
|
),
|
810
810
|
migrations.AddField(
|
811
811
|
model_name="artifact",
|
@@ -1082,7 +1082,7 @@ class Migration(migrations.Migration):
|
|
1082
1082
|
options={
|
1083
1083
|
"unique_together": {("run", "paramvalue")},
|
1084
1084
|
},
|
1085
|
-
bases=(models.Model, lamindb.models.
|
1085
|
+
bases=(models.Model, lamindb.models.IsLink),
|
1086
1086
|
),
|
1087
1087
|
migrations.AddField(
|
1088
1088
|
model_name="run",
|
@@ -1539,7 +1539,7 @@ class Migration(migrations.Migration):
|
|
1539
1539
|
options={
|
1540
1540
|
"unique_together": {("collection", "ulabel")},
|
1541
1541
|
},
|
1542
|
-
bases=(lamindb.models.
|
1542
|
+
bases=(lamindb.models.IsLink, models.Model),
|
1543
1543
|
),
|
1544
1544
|
migrations.AddField(
|
1545
1545
|
model_name="collection",
|
@@ -1624,7 +1624,7 @@ class Migration(migrations.Migration):
|
|
1624
1624
|
options={
|
1625
1625
|
"unique_together": {("artifact", "ulabel", "feature")},
|
1626
1626
|
},
|
1627
|
-
bases=(lamindb.models.
|
1627
|
+
bases=(lamindb.models.IsLink, models.Model),
|
1628
1628
|
),
|
1629
1629
|
migrations.AddField(
|
1630
1630
|
model_name="artifact",
|