lamindb 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. lamindb/__init__.py +25 -6
  2. lamindb/_finish.py +5 -5
  3. lamindb/_tracked.py +1 -1
  4. lamindb/_view.py +4 -4
  5. lamindb/core/_context.py +32 -6
  6. lamindb/core/_settings.py +1 -1
  7. lamindb/core/datasets/mini_immuno.py +8 -0
  8. lamindb/core/loaders.py +1 -1
  9. lamindb/core/storage/_anndata_accessor.py +9 -9
  10. lamindb/core/storage/_valid_suffixes.py +1 -0
  11. lamindb/core/storage/_zarr.py +32 -107
  12. lamindb/curators/__init__.py +19 -2
  13. lamindb/curators/_cellxgene_schemas/__init__.py +3 -3
  14. lamindb/curators/_legacy.py +15 -19
  15. lamindb/curators/core.py +247 -80
  16. lamindb/errors.py +2 -2
  17. lamindb/migrations/0069_squashed.py +8 -8
  18. lamindb/migrations/0071_lamindbv1_migrate_schema.py +3 -3
  19. lamindb/migrations/0073_merge_ourprojects.py +7 -7
  20. lamindb/migrations/0075_lamindbv1_part5.py +1 -1
  21. lamindb/migrations/0077_lamindbv1_part6b.py +3 -3
  22. lamindb/migrations/0080_polish_lamindbv1.py +2 -2
  23. lamindb/migrations/0088_schema_components.py +1 -1
  24. lamindb/migrations/0090_runproject_project_runs.py +2 -2
  25. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +1 -1
  26. lamindb/migrations/0094_writeloglock_writelogmigrationstate_and_more.py +84 -0
  27. lamindb/migrations/0095_remove_rundata_flextable.py +155 -0
  28. lamindb/migrations/0096_remove_artifact__param_values_and_more.py +266 -0
  29. lamindb/migrations/0097_remove_schemaparam_param_remove_paramvalue_param_and_more.py +27 -0
  30. lamindb/migrations/0098_alter_feature_type_alter_project_type_and_more.py +656 -0
  31. lamindb/migrations/0099_alter_writelog_seqno.py +22 -0
  32. lamindb/migrations/0100_branch_alter_artifact__branch_code_and_more.py +102 -0
  33. lamindb/migrations/0101_alter_artifact_hash_alter_feature_name_and_more.py +444 -0
  34. lamindb/migrations/0102_remove_writelog_branch_code_and_more.py +72 -0
  35. lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +46 -0
  36. lamindb/migrations/{0090_squashed.py → 0103_squashed.py} +1013 -1009
  37. lamindb/models/__init__.py +35 -18
  38. lamindb/models/_describe.py +4 -4
  39. lamindb/models/_django.py +38 -4
  40. lamindb/models/_feature_manager.py +66 -123
  41. lamindb/models/_from_values.py +13 -13
  42. lamindb/models/_label_manager.py +8 -6
  43. lamindb/models/_relations.py +7 -7
  44. lamindb/models/artifact.py +166 -156
  45. lamindb/models/can_curate.py +25 -25
  46. lamindb/models/collection.py +48 -18
  47. lamindb/models/core.py +3 -3
  48. lamindb/models/feature.py +88 -60
  49. lamindb/models/has_parents.py +17 -17
  50. lamindb/models/project.py +52 -24
  51. lamindb/models/query_manager.py +5 -5
  52. lamindb/models/query_set.py +61 -37
  53. lamindb/models/record.py +158 -1583
  54. lamindb/models/run.py +39 -176
  55. lamindb/models/save.py +6 -6
  56. lamindb/models/schema.py +32 -43
  57. lamindb/models/sqlrecord.py +1743 -0
  58. lamindb/models/transform.py +17 -33
  59. lamindb/models/ulabel.py +21 -15
  60. {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/METADATA +7 -11
  61. lamindb-1.6.0.dist-info/RECORD +118 -0
  62. lamindb/core/storage/_anndata_sizes.py +0 -41
  63. lamindb/models/flextable.py +0 -163
  64. lamindb-1.5.3.dist-info/RECORD +0 -109
  65. {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/LICENSE +0 -0
  66. {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/WHEEL +0 -0
lamindb/curators/core.py CHANGED
@@ -15,12 +15,13 @@ from __future__ import annotations
15
15
 
16
16
  import copy
17
17
  import re
18
+ from collections.abc import Iterable
18
19
  from typing import TYPE_CHECKING, Any, Callable
19
20
 
20
21
  import lamindb_setup as ln_setup
21
22
  import numpy as np
22
23
  import pandas as pd
23
- import pandera
24
+ import pandera.pandas as pa
24
25
  from lamin_utils import colors, logger
25
26
  from lamindb_setup.core._docs import doc_args
26
27
 
@@ -28,29 +29,29 @@ from lamindb.base.types import FieldAttr # noqa
28
29
  from lamindb.models import (
29
30
  Artifact,
30
31
  Feature,
31
- Record,
32
32
  Run,
33
33
  Schema,
34
+ SQLRecord,
34
35
  )
35
36
  from lamindb.models._from_values import _format_values
36
37
  from lamindb.models.artifact import (
37
- data_is_anndata,
38
- data_is_mudata,
39
- data_is_spatialdata,
38
+ data_is_scversedatastructure,
39
+ data_is_soma_experiment,
40
40
  )
41
41
  from lamindb.models.feature import parse_cat_dtype, parse_dtype
42
42
 
43
43
  from ..errors import InvalidArgument, ValidationError
44
44
 
45
45
  if TYPE_CHECKING:
46
- from collections.abc import Iterable
47
46
  from typing import Any
48
47
 
49
48
  from anndata import AnnData
50
49
  from mudata import MuData
51
50
  from spatialdata import SpatialData
51
+ from tiledbsoma._experiment import Experiment as SOMAExperiment
52
52
 
53
- from lamindb.models.query_set import RecordList
53
+ from lamindb.core.types import ScverseDataStructures
54
+ from lamindb.models.query_set import SQLRecordList
54
55
 
55
56
 
56
57
  def strip_ansi_codes(text):
@@ -79,7 +80,7 @@ class CatLookup:
79
80
  categoricals: list[Feature] | dict[str, FieldAttr],
80
81
  slots: dict[str, FieldAttr] = None,
81
82
  public: bool = False,
82
- sources: dict[str, Record] | None = None,
83
+ sources: dict[str, SQLRecord] | None = None,
83
84
  ) -> None:
84
85
  slots = slots or {}
85
86
  if isinstance(categoricals, list):
@@ -269,7 +270,6 @@ class Curator:
269
270
  )
270
271
 
271
272
 
272
- # default implementation for AnnDataCurator, MuDataCurator, and SpatialDataCurator
273
273
  class SlotsCurator(Curator):
274
274
  """Curator for a dataset with slots.
275
275
 
@@ -281,13 +281,13 @@ class SlotsCurator(Curator):
281
281
 
282
282
  def __init__(
283
283
  self,
284
- dataset: Any,
284
+ dataset: Artifact | ScverseDataStructures | SOMAExperiment,
285
285
  schema: Schema,
286
286
  ) -> None:
287
287
  super().__init__(dataset=dataset, schema=schema)
288
288
  self._slots: dict[str, DataFrameCurator] = {}
289
289
 
290
- # used in MuDataCurator and SpatialDataCurator
290
+ # used for multimodal data structures (not AnnData)
291
291
  # in form of {table/modality_key: var_field}
292
292
  self._var_fields: dict[str, FieldAttr] = {}
293
293
  # in form of {table/modality_key: categoricals}
@@ -320,31 +320,35 @@ class SlotsCurator(Curator):
320
320
  """{}""" # noqa: D415
321
321
  if not self._is_validated:
322
322
  self.validate()
323
+
323
324
  if self._artifact is None:
324
- if data_is_anndata(self._dataset):
325
- self._artifact = Artifact.from_anndata(
326
- self._dataset,
327
- key=key,
328
- description=description,
329
- revises=revises,
330
- run=run,
331
- )
332
- if data_is_mudata(self._dataset):
333
- self._artifact = Artifact.from_mudata(
334
- self._dataset,
335
- key=key,
336
- description=description,
337
- revises=revises,
338
- run=run,
339
- )
340
- elif data_is_spatialdata(self._dataset):
341
- self._artifact = Artifact.from_spatialdata(
342
- self._dataset,
343
- key=key,
344
- description=description,
345
- revises=revises,
346
- run=run,
347
- )
325
+ type_mapping = [
326
+ (
327
+ lambda data: data_is_scversedatastructure(data, "AnnData"),
328
+ Artifact.from_anndata,
329
+ ),
330
+ (
331
+ lambda data: data_is_scversedatastructure(data, "MuData"),
332
+ Artifact.from_mudata,
333
+ ),
334
+ (
335
+ lambda data: data_is_scversedatastructure(data, "SpatialData"),
336
+ Artifact.from_spatialdata,
337
+ ),
338
+ (data_is_soma_experiment, Artifact.from_tiledbsoma),
339
+ ]
340
+
341
+ for type_check, factory in type_mapping:
342
+ if type_check(self._dataset):
343
+ self._artifact = factory( # type: ignore
344
+ self._dataset,
345
+ key=key,
346
+ description=description,
347
+ revises=revises,
348
+ run=run,
349
+ )
350
+ break
351
+
348
352
  self._artifact.schema = self._schema
349
353
  self._artifact.save()
350
354
  cat_vectors = {}
@@ -358,24 +362,57 @@ class SlotsCurator(Curator):
358
362
  )
359
363
 
360
364
 
365
+ def is_list_of_type(value, expected_type):
366
+ """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
367
+ if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
368
+ # handle nested lists recursively
369
+ return all(is_list_of_type(item, expected_type) for item in value)
370
+ return isinstance(value, expected_type)
371
+
372
+
361
373
  def check_dtype(expected_type) -> Callable:
362
374
  """Creates a check function for Pandera that validates a column's dtype.
363
375
 
376
+ Supports both standard dtype checking and mixed list/single values for
377
+ the same type. For example, a column with expected_type 'float' would
378
+ also accept a mix of float values and lists of floats.
379
+
364
380
  Args:
365
- expected_type: String identifier for the expected type ('int', 'float', or 'num')
381
+ expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
366
382
 
367
383
  Returns:
368
- A function that checks if a series has the expected dtype
384
+ A function that checks if a series has the expected dtype or contains mixed types
369
385
  """
370
386
 
371
387
  def check_function(series):
372
- if expected_type == "int":
373
- is_valid = pd.api.types.is_integer_dtype(series.dtype)
374
- elif expected_type == "float":
375
- is_valid = pd.api.types.is_float_dtype(series.dtype)
376
- elif expected_type == "num":
377
- is_valid = pd.api.types.is_numeric_dtype(series.dtype)
378
- return is_valid
388
+ # first check if the series is entirely of the expected dtype (fast path)
389
+ if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
390
+ return True
391
+ elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
392
+ return True
393
+ elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
394
+ return True
395
+ elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
396
+ return True
397
+
398
+ # if we're here, it might be a mixed column with object dtype
399
+ # need to check each value individually
400
+ if series.dtype == "object" and expected_type.startswith("list"):
401
+ expected_type_member = expected_type.replace("list[", "").removesuffix("]")
402
+ if expected_type_member == "int":
403
+ return series.apply(lambda x: is_list_of_type(x, int)).all()
404
+ elif expected_type_member == "float":
405
+ return series.apply(lambda x: is_list_of_type(x, float)).all()
406
+ elif expected_type_member == "num":
407
+ # for numeric, accept either int or float
408
+ return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
409
+ elif expected_type_member == "str" or expected_type_member.startswith(
410
+ "cat["
411
+ ):
412
+ return series.apply(lambda x: is_list_of_type(x, str)).all()
413
+
414
+ # if we get here, the validation failed
415
+ return False
379
416
 
380
417
  return check_function
381
418
 
@@ -452,7 +489,10 @@ class DataFrameCurator(Curator):
452
489
  required = feature.uid not in optional_feature_uids
453
490
  else:
454
491
  required = False
455
- if feature.dtype in {"int", "float", "num"}:
492
+ # series.dtype is "object" if the column holds list-typed values, e.g. [["a", "b"], ["a"], ["b"]]
493
+ if feature.dtype in {"int", "float", "num"} or feature.dtype.startswith(
494
+ "list"
495
+ ):
456
496
  if isinstance(self._dataset, pd.DataFrame):
457
497
  dtype = (
458
498
  self._dataset[feature.name].dtype
@@ -461,9 +501,9 @@ class DataFrameCurator(Curator):
461
501
  )
462
502
  else:
463
503
  dtype = None
464
- pandera_columns[feature.name] = pandera.Column(
504
+ pandera_columns[feature.name] = pa.Column(
465
505
  dtype=None,
466
- checks=pandera.Check(
506
+ checks=pa.Check(
467
507
  check_dtype(feature.dtype),
468
508
  element_wise=False,
469
509
  error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
@@ -478,27 +518,29 @@ class DataFrameCurator(Curator):
478
518
  if not feature.dtype.startswith("cat")
479
519
  else "category"
480
520
  )
481
- pandera_columns[feature.name] = pandera.Column(
521
+ pandera_columns[feature.name] = pa.Column(
482
522
  pandera_dtype,
483
523
  nullable=feature.nullable,
484
524
  coerce=feature.coerce_dtype,
485
525
  required=required,
486
526
  )
487
- if feature.dtype.startswith("cat"):
527
+ if feature.dtype.startswith("cat") or feature.dtype.startswith(
528
+ "list[cat["
529
+ ):
488
530
  # validate categoricals if the column is required or if the column is present
489
531
  if required or feature.name in self._dataset.keys():
490
532
  categoricals.append(feature)
491
533
  if schema._index_feature_uid is not None:
492
534
  # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
493
535
  # so, we're typing it as `str` here
494
- index = pandera.Index(
536
+ index = pa.Index(
495
537
  schema.index.dtype
496
538
  if not schema.index.dtype.startswith("cat")
497
539
  else str
498
540
  )
499
541
  else:
500
542
  index = None
501
- self._pandera_schema = pandera.DataFrameSchema(
543
+ self._pandera_schema = pa.DataFrameSchema(
502
544
  pandera_columns,
503
545
  coerce=schema.coerce_dtype,
504
546
  strict=schema.maximal_set,
@@ -582,7 +624,7 @@ class DataFrameCurator(Curator):
582
624
  self._pandera_schema.validate(self._dataset)
583
625
  # then validate lamindb categoricals
584
626
  self._cat_manager_validate()
585
- except pandera.errors.SchemaError as err:
627
+ except pa.errors.SchemaError as err:
586
628
  self._is_validated = False
587
629
  # .exconly() doesn't exist on SchemaError
588
630
  raise ValidationError(str(err)) from err
@@ -627,8 +669,12 @@ class AnnDataCurator(SlotsCurator):
627
669
 
628
670
  Example:
629
671
 
630
- See :meth:`~lamindb.Artifact.from_anndata`.
672
+ .. literalinclude:: scripts/curate_anndata_flexible.py
673
+ :language: python
674
+ :caption: curate_anndata_flexible.py
631
675
 
676
+ See Also:
677
+ :meth:`~lamindb.Artifact.from_anndata`.
632
678
  """
633
679
 
634
680
  def __init__(
@@ -637,7 +683,7 @@ class AnnDataCurator(SlotsCurator):
637
683
  schema: Schema,
638
684
  ) -> None:
639
685
  super().__init__(dataset=dataset, schema=schema)
640
- if not data_is_anndata(self._dataset):
686
+ if not data_is_scversedatastructure(self._dataset, "AnnData"):
641
687
  raise InvalidArgument("dataset must be AnnData-like.")
642
688
  if schema.otype != "AnnData":
643
689
  raise InvalidArgument("Schema otype must be 'AnnData'.")
@@ -710,9 +756,12 @@ class MuDataCurator(SlotsCurator):
710
756
 
711
757
  Example:
712
758
 
713
- .. literalinclude:: scripts/curate-mudata.py
759
+ .. literalinclude:: scripts/curate_mudata.py
714
760
  :language: python
715
- :caption: curate-mudata.py
761
+ :caption: curate_mudata.py
762
+
763
+ See Also:
764
+ :meth:`~lamindb.Artifact.from_mudata`.
716
765
  """
717
766
 
718
767
  def __init__(
@@ -721,7 +770,7 @@ class MuDataCurator(SlotsCurator):
721
770
  schema: Schema,
722
771
  ) -> None:
723
772
  super().__init__(dataset=dataset, schema=schema)
724
- if not data_is_mudata(self._dataset):
773
+ if not data_is_scversedatastructure(self._dataset, "MuData"):
725
774
  raise InvalidArgument("dataset must be MuData-like.")
726
775
  if schema.otype != "MuData":
727
776
  raise InvalidArgument("Schema otype must be 'MuData'.")
@@ -774,18 +823,21 @@ class SpatialDataCurator(SlotsCurator):
774
823
 
775
824
  Example:
776
825
 
777
- See :meth:`~lamindb.Artifact.from_spatialdata`.
826
+ .. literalinclude:: scripts/curate_mudata.py
827
+ :language: python
828
+ :caption: curate_mudata.py
829
+
830
+ See Also:
831
+ :meth:`~lamindb.Artifact.from_spatialdata`.
778
832
  """
779
833
 
780
834
  def __init__(
781
835
  self,
782
836
  dataset: SpatialData | Artifact,
783
837
  schema: Schema,
784
- *,
785
- sample_metadata_key: str | None = "sample",
786
838
  ) -> None:
787
839
  super().__init__(dataset=dataset, schema=schema)
788
- if not data_is_spatialdata(self._dataset):
840
+ if not data_is_scversedatastructure(self._dataset, "SpatialData"):
789
841
  raise InvalidArgument("dataset must be SpatialData-like.")
790
842
  if schema.otype != "SpatialData":
791
843
  raise InvalidArgument("Schema otype must be 'SpatialData'.")
@@ -851,6 +903,92 @@ class SpatialDataCurator(SlotsCurator):
851
903
  self._columns_field = self._var_fields
852
904
 
853
905
 
906
+ class TiledbsomaExperimentCurator(SlotsCurator):
907
+ """Curator for `TileDB-SOMA`.
908
+
909
+ Args:
910
+ dataset: The `tiledbsoma.Experiment` object.
911
+ schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
912
+
913
+ Example:
914
+
915
+ .. literalinclude:: scripts/curate_soma_experiment.py
916
+ :language: python
917
+ :caption: curate_soma_experiment.py
918
+
919
+ See Also:
920
+ :meth:`~lamindb.Artifact.from_tiledbsoma`.
921
+ """
922
+
923
+ def __init__(
924
+ self,
925
+ dataset: SOMAExperiment | Artifact,
926
+ schema: Schema,
927
+ ) -> None:
928
+ super().__init__(dataset=dataset, schema=schema)
929
+ if not data_is_soma_experiment(self._dataset):
930
+ raise InvalidArgument("dataset must be SOMAExperiment-like.")
931
+ if schema.otype != "tiledbsoma":
932
+ raise InvalidArgument("Schema otype must be 'tiledbsoma'.")
933
+
934
+ for slot, slot_schema in schema.slots.items():
935
+ if slot.startswith("ms:"):
936
+ ms, modality_slot = slot.split(":")
937
+ schema_dataset = (
938
+ self._dataset.ms[modality_slot.removesuffix(".T")]
939
+ .var.read()
940
+ .concat()
941
+ .to_pandas()
942
+ .drop("soma_joinid", axis=1, errors="ignore")
943
+ )
944
+
945
+ self._slots[slot] = DataFrameCurator(
946
+ (
947
+ schema_dataset.T
948
+ if modality_slot == "var.T"
949
+ or (
950
+ # backward compat
951
+ modality_slot == "var"
952
+ and schema.slots[slot].itype not in {None, "Feature"}
953
+ )
954
+ else schema_dataset
955
+ ),
956
+ slot_schema,
957
+ )
958
+ else:
959
+ # global Experiment obs slot
960
+ _ms, modality_slot = None, slot
961
+ schema_dataset = (
962
+ self._dataset.obs.read()
963
+ .concat()
964
+ .to_pandas()
965
+ .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
966
+ )
967
+ self._slots[slot] = DataFrameCurator(
968
+ schema_dataset,
969
+ slot_schema,
970
+ )
971
+
972
+ if modality_slot == "var" and schema.slots[slot].itype not in {
973
+ None,
974
+ "Feature",
975
+ }:
976
+ logger.warning(
977
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
978
+ )
979
+
980
+ _assign_var_fields_categoricals_multimodal(
981
+ modality=slot, # not using "ms" here as it would always be the same for all modalities
982
+ slot_type=modality_slot,
983
+ slot=slot,
984
+ slot_schema=slot_schema,
985
+ var_fields=self._var_fields,
986
+ cat_vectors=self._cat_vectors,
987
+ slots=self._slots,
988
+ )
989
+ self._columns_field = self._var_fields
990
+
991
+
854
992
  class CatVector:
855
993
  """Vector with categorical values."""
856
994
 
@@ -861,7 +999,7 @@ class CatVector:
861
999
  field: FieldAttr, # The field to validate against.
862
1000
  key: str, # The name of the vector to validate. Only used for logging.
863
1001
  values_setter: Callable | None = None, # A callable that sets the values.
864
- source: Record | None = None, # The ontology source to validate against.
1002
+ source: SQLRecord | None = None, # The ontology source to validate against.
865
1003
  feature: Feature | None = None,
866
1004
  cat_manager: DataFrameCatManager | None = None,
867
1005
  subtype_str: str = "",
@@ -924,10 +1062,20 @@ class CatVector:
924
1062
 
925
1063
  def _replace_synonyms(self) -> list[str]:
926
1064
  """Replace synonyms in the vector with standardized values."""
1065
+
1066
+ def process_value(value, syn_mapper):
1067
+ """Helper function to process values recursively."""
1068
+ if isinstance(value, list):
1069
+ # Handle list - recursively process each item
1070
+ return [process_value(item, syn_mapper) for item in value]
1071
+ else:
1072
+ # Handle single value
1073
+ return syn_mapper.get(value, value)
1074
+
927
1075
  syn_mapper = self._synonyms
928
1076
  # replace the values in df
929
1077
  std_values = self.values.map(
930
- lambda unstd_val: syn_mapper.get(unstd_val, unstd_val)
1078
+ lambda unstd_val: process_value(unstd_val, syn_mapper)
931
1079
  )
932
1080
  # remove the standardized values from self.non_validated
933
1081
  non_validated = [i for i in self._non_validated if i not in syn_mapper]
@@ -971,15 +1119,28 @@ class CatVector:
971
1119
  filter_kwargs = get_current_filter_kwargs(
972
1120
  registry, {"organism": self._organism, "source": self._source}
973
1121
  )
974
- values = [i for i in self.values if isinstance(i, str) and i]
1122
+ values = [
1123
+ i
1124
+ for i in self.values
1125
+ if (isinstance(i, str) and i)
1126
+ or (isinstance(i, list) and i)
1127
+ or (isinstance(i, np.ndarray) and i.size > 0)
1128
+ ]
975
1129
  if not values:
976
1130
  return [], []
1131
+
1132
+ # if a value is a list, we need to flatten it
1133
+ str_values = _flatten_unique(values)
1134
+
977
1135
  # inspect the default instance and save validated records from public
978
1136
  if (
979
1137
  self._subtype_str != "" and "__" not in self._subtype_str
980
1138
  ): # not for general filter expressions
981
- self._subtype_query_set = registry.get(name=self._subtype_str).records.all()
982
- values_array = np.array(values)
1139
+ related_name = registry._meta.get_field("type").remote_field.related_name
1140
+ self._subtype_query_set = getattr(
1141
+ registry.get(name=self._subtype_str), related_name
1142
+ ).all()
1143
+ values_array = np.array(str_values)
983
1144
  validated_mask = self._subtype_query_set.validate( # type: ignore
984
1145
  values_array, field=self._field, **filter_kwargs, mute=True
985
1146
  )
@@ -992,7 +1153,7 @@ class CatVector:
992
1153
  )
993
1154
  else:
994
1155
  existing_and_public_records = registry.from_values(
995
- list(values), field=self._field, **filter_kwargs, mute=True
1156
+ str_values, field=self._field, **filter_kwargs, mute=True
996
1157
  )
997
1158
  existing_and_public_labels = [
998
1159
  getattr(r, field_name) for r in existing_and_public_records
@@ -1019,7 +1180,7 @@ class CatVector:
1019
1180
  )
1020
1181
  # non-validated records from the default instance
1021
1182
  non_validated_labels = [
1022
- i for i in values if i not in existing_and_public_labels
1183
+ i for i in str_values if i not in existing_and_public_labels
1023
1184
  ]
1024
1185
  validated_labels = existing_and_public_labels
1025
1186
  records = existing_and_public_records
@@ -1040,7 +1201,7 @@ class CatVector:
1040
1201
 
1041
1202
  registry = self._field.field.model
1042
1203
  field_name = self._field.field.name
1043
- non_validated_records: RecordList[Any] = [] # type: ignore
1204
+ non_validated_records: SQLRecordList[Any] = [] # type: ignore
1044
1205
  if df is not None and registry == Feature:
1045
1206
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
1046
1207
  non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
@@ -1204,7 +1365,7 @@ class DataFrameCatManager:
1204
1365
  columns_field: FieldAttr = Feature.name,
1205
1366
  columns_names: Iterable[str] | None = None,
1206
1367
  categoricals: list[Feature] | None = None,
1207
- sources: dict[str, Record] | None = None,
1368
+ sources: dict[str, SQLRecord] | None = None,
1208
1369
  index: Feature | None = None,
1209
1370
  slot: str | None = None,
1210
1371
  maximal_set: bool = False,
@@ -1372,20 +1533,20 @@ class DataFrameCatManager:
1372
1533
  self._cat_vectors[key].add_new(**kwargs)
1373
1534
 
1374
1535
 
1375
- def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
1536
+ def get_current_filter_kwargs(registry: type[SQLRecord], kwargs: dict) -> dict:
1376
1537
  """Make sure the source and organism are saved in the same database as the registry."""
1377
1538
  db = registry.filter().db
1378
1539
  source = kwargs.get("source")
1379
1540
  organism = kwargs.get("organism")
1380
1541
  filter_kwargs = kwargs.copy()
1381
1542
 
1382
- if isinstance(organism, Record) and organism._state.db != "default":
1543
+ if isinstance(organism, SQLRecord) and organism._state.db != "default":
1383
1544
  if db is None or db == "default":
1384
1545
  organism_default = copy.copy(organism)
1385
1546
  # save the organism record in the default database
1386
1547
  organism_default.save()
1387
1548
  filter_kwargs["organism"] = organism_default
1388
- if isinstance(source, Record) and source._state.db != "default":
1549
+ if isinstance(source, SQLRecord) and source._state.db != "default":
1389
1550
  if db is None or db == "default":
1390
1551
  source_default = copy.copy(source)
1391
1552
  # save the source record in the default database
@@ -1505,18 +1666,24 @@ def annotate_artifact(
1505
1666
  return artifact
1506
1667
 
1507
1668
 
1508
- # TODO: need this function to support mutli-value columns
1509
1669
  def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
1510
- """Flatten a Pandas series containing lists or single items into a unique list of elements."""
1511
- result = set()
1670
+ """Flatten a Pandas series containing lists or single items into a unique list of elements.
1671
+
1672
+ The order of elements in the result list preserves the order they first appear in the input series.
1673
+ """
1674
+ # Use a dict as an ordered set: keys keep first-insertion order and are unique
1675
+ result: dict = {}
1512
1676
 
1513
1677
  for item in series:
1514
- if isinstance(item, list):
1515
- result.update(item)
1678
+ if isinstance(item, list | np.ndarray):
1679
+ # Add each element to the dict (only first occurrence is kept)
1680
+ for element in item:
1681
+ result[element] = None
1516
1682
  else:
1517
- result.add(item)
1683
+ result[item] = None
1518
1684
 
1519
- return list(result)
1685
+ # Return the keys as a list, preserving order
1686
+ return list(result.keys())
1520
1687
 
1521
1688
 
1522
1689
  def _save_organism(name: str):
lamindb/errors.py CHANGED
@@ -10,7 +10,7 @@
10
10
  MissingContextUID
11
11
  UpdateContext
12
12
  IntegrityError
13
- RecordNameChangeIntegrityError
13
+ SQLRecordNameChangeIntegrityError
14
14
 
15
15
  """
16
16
 
@@ -57,7 +57,7 @@ class InconsistentKey(Exception):
57
57
  pass
58
58
 
59
59
 
60
- class RecordNameChangeIntegrityError(Exception):
60
+ class SQLRecordNameChangeIntegrityError(Exception):
61
61
  """Custom exception for name change errors."""
62
62
 
63
63
  pass
@@ -569,7 +569,7 @@ class Migration(migrations.Migration):
569
569
  ),
570
570
  ),
571
571
  ],
572
- bases=(lamindb.models.LinkORM, models.Model),
572
+ bases=(lamindb.models.IsLink, models.Model),
573
573
  ),
574
574
  migrations.AddField(
575
575
  model_name="collection",
@@ -619,7 +619,7 @@ class Migration(migrations.Migration):
619
619
  ),
620
620
  ),
621
621
  ],
622
- bases=(lamindb.models.LinkORM, models.Model),
622
+ bases=(lamindb.models.IsLink, models.Model),
623
623
  ),
624
624
  migrations.AddField(
625
625
  model_name="artifact",
@@ -656,7 +656,7 @@ class Migration(migrations.Migration):
656
656
  options={
657
657
  "unique_together": {("featureset", "feature")},
658
658
  },
659
- bases=(models.Model, lamindb.models.LinkORM),
659
+ bases=(models.Model, lamindb.models.IsLink),
660
660
  ),
661
661
  migrations.AddField(
662
662
  model_name="feature",
@@ -727,7 +727,7 @@ class Migration(migrations.Migration):
727
727
  ),
728
728
  ),
729
729
  ],
730
- bases=(lamindb.models.LinkORM, models.Model),
730
+ bases=(lamindb.models.IsLink, models.Model),
731
731
  ),
732
732
  migrations.AddField(
733
733
  model_name="artifact",
@@ -805,7 +805,7 @@ class Migration(migrations.Migration):
805
805
  options={
806
806
  "unique_together": {("artifact", "paramvalue")},
807
807
  },
808
- bases=(models.Model, lamindb.models.LinkORM),
808
+ bases=(models.Model, lamindb.models.IsLink),
809
809
  ),
810
810
  migrations.AddField(
811
811
  model_name="artifact",
@@ -1082,7 +1082,7 @@ class Migration(migrations.Migration):
1082
1082
  options={
1083
1083
  "unique_together": {("run", "paramvalue")},
1084
1084
  },
1085
- bases=(models.Model, lamindb.models.LinkORM),
1085
+ bases=(models.Model, lamindb.models.IsLink),
1086
1086
  ),
1087
1087
  migrations.AddField(
1088
1088
  model_name="run",
@@ -1539,7 +1539,7 @@ class Migration(migrations.Migration):
1539
1539
  options={
1540
1540
  "unique_together": {("collection", "ulabel")},
1541
1541
  },
1542
- bases=(lamindb.models.LinkORM, models.Model),
1542
+ bases=(lamindb.models.IsLink, models.Model),
1543
1543
  ),
1544
1544
  migrations.AddField(
1545
1545
  model_name="collection",
@@ -1624,7 +1624,7 @@ class Migration(migrations.Migration):
1624
1624
  options={
1625
1625
  "unique_together": {("artifact", "ulabel", "feature")},
1626
1626
  },
1627
- bases=(lamindb.models.LinkORM, models.Model),
1627
+ bases=(lamindb.models.IsLink, models.Model),
1628
1628
  ),
1629
1629
  migrations.AddField(
1630
1630
  model_name="artifact",