lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (62)
  1. lamindb/__init__.py +1 -3
  2. lamindb/_finish.py +32 -16
  3. lamindb/base/types.py +6 -4
  4. lamindb/core/_context.py +127 -57
  5. lamindb/core/_mapped_collection.py +1 -1
  6. lamindb/core/_settings.py +44 -4
  7. lamindb/core/_track_environment.py +5 -2
  8. lamindb/core/loaders.py +1 -1
  9. lamindb/core/storage/_anndata_accessor.py +1 -1
  10. lamindb/core/storage/_tiledbsoma.py +14 -8
  11. lamindb/core/storage/_valid_suffixes.py +0 -1
  12. lamindb/core/storage/_zarr.py +1 -1
  13. lamindb/core/storage/objects.py +13 -8
  14. lamindb/core/storage/paths.py +9 -6
  15. lamindb/core/types.py +1 -1
  16. lamindb/curators/_legacy.py +2 -1
  17. lamindb/curators/core.py +106 -105
  18. lamindb/errors.py +9 -0
  19. lamindb/examples/fixtures/__init__.py +0 -0
  20. lamindb/examples/fixtures/sheets.py +224 -0
  21. lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
  22. lamindb/migrations/0105_record_unique_name.py +20 -0
  23. lamindb/migrations/0106_transfer_data_migration.py +25 -0
  24. lamindb/migrations/0107_add_schema_to_record.py +68 -0
  25. lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
  26. lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
  27. lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
  28. lamindb/migrations/0111_remove_record__sort_order.py +148 -0
  29. lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
  30. lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
  31. lamindb/migrations/0114_alter_run__status_code.py +24 -0
  32. lamindb/migrations/0115_alter_space_uid.py +52 -0
  33. lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
  34. lamindb/models/__init__.py +4 -3
  35. lamindb/models/_describe.py +88 -31
  36. lamindb/models/_feature_manager.py +627 -658
  37. lamindb/models/_label_manager.py +1 -3
  38. lamindb/models/artifact.py +214 -99
  39. lamindb/models/collection.py +7 -1
  40. lamindb/models/feature.py +288 -60
  41. lamindb/models/has_parents.py +3 -3
  42. lamindb/models/project.py +32 -15
  43. lamindb/models/query_manager.py +7 -1
  44. lamindb/models/query_set.py +118 -41
  45. lamindb/models/record.py +140 -94
  46. lamindb/models/run.py +42 -42
  47. lamindb/models/save.py +102 -16
  48. lamindb/models/schema.py +41 -8
  49. lamindb/models/sqlrecord.py +105 -40
  50. lamindb/models/storage.py +278 -0
  51. lamindb/models/transform.py +10 -2
  52. lamindb/models/ulabel.py +9 -1
  53. lamindb/py.typed +0 -0
  54. lamindb/setup/__init__.py +2 -1
  55. lamindb/setup/_switch.py +16 -0
  56. lamindb/setup/errors/__init__.py +4 -0
  57. lamindb/setup/types/__init__.py +4 -0
  58. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
  59. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
  60. lamindb/models/core.py +0 -135
  61. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
  62. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
lamindb/core/storage/_tiledbsoma.py CHANGED
@@ -13,7 +13,7 @@ from lamindb_setup.core.upath import LocalPathClasses, create_path
 from packaging import version
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
     from tiledbsoma import Measurement as SOMAMeasurement
@@ -54,12 +54,18 @@ def _tiledb_config_s3(storepath: UPath) -> dict:
     else:
         tiledb_config["vfs.s3.region"] = get_storage_region(storepath)
 
-    if "key" in storage_options:
-        tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
-    if "secret" in storage_options:
-        tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
-    if "token" in storage_options:
-        tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
+    if storage_options.get("anon", False):
+        tiledb_config["vfs.s3.no_sign_request"] = "true"
+        tiledb_config["vfs.s3.aws_access_key_id"] = ""
+        tiledb_config["vfs.s3.aws_secret_access_key"] = ""
+        tiledb_config["vfs.s3.aws_session_token"] = ""
+    else:
+        if "key" in storage_options:
+            tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
+        if "secret" in storage_options:
+            tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
+        if "token" in storage_options:
+            tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
 
     return tiledb_config
 
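Note: the new `anon` branch makes anonymous S3 access explicit instead of silently omitting credentials. A minimal sketch of the mapping this hunk implements, assuming a plain dict of fsspec-style storage options (the function name is illustrative, not lamindb API):

    def s3_options_to_tiledb(storage_options: dict) -> dict:
        """Sketch: translate fsspec-style S3 options into TileDB VFS config."""
        tiledb_config = {}
        if storage_options.get("anon", False):
            # unsigned requests for public buckets; blank out any ambient credentials
            tiledb_config["vfs.s3.no_sign_request"] = "true"
            tiledb_config["vfs.s3.aws_access_key_id"] = ""
            tiledb_config["vfs.s3.aws_secret_access_key"] = ""
            tiledb_config["vfs.s3.aws_session_token"] = ""
        else:
            for fsspec_key, tiledb_key in [
                ("key", "vfs.s3.aws_access_key_id"),
                ("secret", "vfs.s3.aws_secret_access_key"),
                ("token", "vfs.s3.aws_session_token"),
            ]:
                if fsspec_key in storage_options:
                    tiledb_config[tiledb_key] = storage_options[fsspec_key]
        return tiledb_config

    assert s3_options_to_tiledb({"anon": True})["vfs.s3.no_sign_request"] == "true"
    assert s3_options_to_tiledb({"key": "AKIA...", "secret": "abc"}) == {
        "vfs.s3.aws_access_key_id": "AKIA...",
        "vfs.s3.aws_secret_access_key": "abc",
    }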
@@ -148,7 +154,7 @@ def save_tiledbsoma_experiment(
     else:
         uid, _ = create_uid(n_full_id=20)
         storage_key = auto_storage_key_from_artifact_uid(
-            uid, ".tiledbsoma", is_dir=True
+            uid, ".tiledbsoma", overwrite_versions=True
        )
        storepath = setup_settings.storage.root / storage_key
 
lamindb/core/storage/_valid_suffixes.py CHANGED
@@ -6,7 +6,6 @@ from lamindb_setup.core.upath import VALID_COMPOSITE_SUFFIXES, VALID_SIMPLE_SUFF
 VALID_COMPOSITE_SUFFIXES.update(
     {
         ".vitessce.json",
-        "spatialdata.zarr",
         ".ome.zarr",
     }
 )
lamindb/core/storage/_zarr.py CHANGED
@@ -24,7 +24,7 @@ else:
 
 if TYPE_CHECKING:
     from fsspec import FSMap
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
 
     from lamindb.core.types import ScverseDataStructures
 
lamindb/core/storage/objects.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import PurePosixPath
-from typing import TYPE_CHECKING, TypeAlias
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 from anndata import AnnData
 from pandas import DataFrame
@@ -12,14 +12,15 @@ from lamindb.core._compat import (
 from lamindb.core.types import ScverseDataStructures
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
 
 SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures
 
 
-def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
+def infer_suffix(dmem: SupportedDataTypes, format: str | dict[str, Any] | None = None):
     """Infer LaminDB storage file suffix from a data object."""
     if isinstance(dmem, AnnData):
+        assert not isinstance(format, dict)  # noqa: S101
         if format is not None:
             # should be `.h5ad`, `.zarr`, or `.anndata.zarr`
             if format not in {"h5ad", "zarr", "anndata.zarr"}:
@@ -32,8 +33,12 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
         return ".h5ad"
 
     if isinstance(dmem, DataFrame):
-        if format == ".csv":
-            return ".csv"
+        if isinstance(format, str):
+            if format == ".csv":
+                return ".csv"
+        elif isinstance(format, dict):
+            if format.get("suffix") == ".csv":
+                return ".csv"
         return ".parquet"
 
     if with_package_obj(
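Note: `format` may now be a dict for DataFrames, with the target suffix read from its "suffix" key; AnnData still accepts only a string, hence the added assert above. A hedged usage sketch:

    import pandas as pd
    from lamindb.core.storage.objects import infer_suffix  # module shown in this diff

    df = pd.DataFrame({"a": [1, 2]})

    infer_suffix(df)                             # -> ".parquet" (default)
    infer_suffix(df, format=".csv")              # -> ".csv" (string form, as before)
    infer_suffix(df, format={"suffix": ".csv"})  # -> ".csv" (dict form, new in 1.7.0)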
@@ -68,7 +73,7 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
     raise NotImplementedError
 
 
-def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
+def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr, **kwargs) -> None:
     """Writes the passed in memory data to disk to a specified path."""
     if isinstance(dmem, AnnData):
         suffix = PurePosixPath(filepath).suffix
@@ -83,9 +88,9 @@ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
 
     if isinstance(dmem, DataFrame):
         if filepath.suffix == ".csv":
-            dmem.to_csv(filepath)
+            dmem.to_csv(filepath, **kwargs)
             return
-        dmem.to_parquet(filepath)
+        dmem.to_parquet(filepath, **kwargs)
         return
 
     if with_package_obj(dmem, "MuData", "mudata", lambda obj: obj.write(filepath))[0]:
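Note: the new `**kwargs` are only consumed by the DataFrame branch, where they are forwarded verbatim to `DataFrame.to_csv` / `DataFrame.to_parquet`. A sketch (file names are arbitrary):

    from pathlib import Path

    import pandas as pd
    from lamindb.core.storage.objects import write_to_disk  # module shown in this diff

    df = pd.DataFrame({"a": [1, 2]})

    # kwargs reach pandas unchanged, e.g. drop the index from a CSV export
    write_to_disk(df, Path("data.csv"), index=False)
    # or choose a parquet compression codec (requires a parquet engine such as pyarrow)
    write_to_disk(df, Path("data.parquet"), compression="zstd")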
lamindb/core/storage/paths.py CHANGED
@@ -15,7 +15,7 @@ from lamindb.core._settings import settings
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
 
     from lamindb.models.artifact import Artifact
 
@@ -26,15 +26,18 @@ AUTO_KEY_PREFIX = ".lamindb/"
 # add type annotations back asap when re-organizing the module
 def auto_storage_key_from_artifact(artifact: Artifact):
     if artifact.key is None or artifact._key_is_virtual:
-        is_dir = artifact.n_files is not None
-        return auto_storage_key_from_artifact_uid(artifact.uid, artifact.suffix, is_dir)
+        return auto_storage_key_from_artifact_uid(
+            artifact.uid, artifact.suffix, artifact.overwrite_versions
+        )
     else:
         return artifact.key
 
 
-def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> str:
+def auto_storage_key_from_artifact_uid(
+    uid: str, suffix: str, overwrite_versions: bool
+) -> str:
     assert isinstance(suffix, str)  # noqa: S101 Suffix cannot be None.
-    if is_dir:
+    if overwrite_versions:
         uid_storage = uid[:16]  # 16 chars, leave 4 chars for versioning
     else:
         uid_storage = uid
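Note: truncation is now keyed off whether versions overwrite each other in place rather than off directory-ness. Assuming the function joins AUTO_KEY_PREFIX, the (possibly truncated) uid, and the suffix, as the surrounding code suggests, the resulting keys look like:

    from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid

    uid = "AbCdEfGhIjKlMnOp0001"  # hypothetical 20-char artifact uid

    # overwrite_versions=True: all versions share one storage path, so the
    # 4 trailing version chars are dropped from the key
    auto_storage_key_from_artifact_uid(uid, ".tiledbsoma", overwrite_versions=True)
    # expected: ".lamindb/AbCdEfGhIjKlMnOp.tiledbsoma"

    # overwrite_versions=False: every version gets its own path
    auto_storage_key_from_artifact_uid(uid, ".parquet", overwrite_versions=False)
    # expected: ".lamindb/AbCdEfGhIjKlMnOp0001.parquet"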
@@ -75,7 +78,7 @@ def attempt_accessing_path(
 
     if (
         artifact._state.db in ("default", None)
-        and artifact.storage_id == settings._storage_settings.id
+        and artifact.storage_id == settings._storage_settings._id
     ):
         if access_token is None:
             storage_settings = settings._storage_settings
lamindb/core/types.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, TypeVar
 
 from anndata import AnnData
-from lamindb_setup.core.types import UPathStr
+from lamindb_setup.types import UPathStr
 
 from lamindb.base.types import (
     Dtype,
lamindb/curators/_legacy.py CHANGED
@@ -16,7 +16,7 @@ from lamindb.models.artifact import data_is_scversedatastructure
 from ..errors import InvalidArgument
 
 if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
+    from lamindb_setup.types import UPathStr
     from mudata import MuData
     from spatialdata import SpatialData
 
@@ -222,6 +222,7 @@ class DataFrameCatManager(CatManager):
             key="columns",
             source=self._sources.get("columns"),
         )
+        self._cat_vectors["columns"].add_new()
         for key, field in self._categoricals.items():
             self._cat_vectors[key] = CatVector(
                 values_getter=lambda k=key: self._dataset[
lamindb/curators/core.py CHANGED
@@ -21,7 +21,7 @@ from typing import TYPE_CHECKING, Any, Callable
 import lamindb_setup as ln_setup
 import numpy as np
 import pandas as pd
-import pandera.pandas as pa
+import pandera.pandas as pandera
 from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
 
@@ -38,7 +38,12 @@ from lamindb.models.artifact import (
     data_is_scversedatastructure,
     data_is_soma_experiment,
 )
-from lamindb.models.feature import parse_cat_dtype, parse_dtype
+from lamindb.models.feature import (
+    parse_cat_dtype,
+    parse_dtype,
+    parse_filter_string,
+    resolve_relation_filters,
+)
 
 from ..errors import InvalidArgument, ValidationError
 
@@ -276,7 +281,6 @@ class SlotsCurator(Curator):
     Args:
         dataset: The dataset to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
-
     """
 
     def __init__(
@@ -324,23 +328,25 @@ class SlotsCurator(Curator):
         if self._artifact is None:
             type_mapping = [
                 (
-                    lambda data: data_is_scversedatastructure(data, "AnnData"),
+                    lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
                     Artifact.from_anndata,
                 ),
                 (
-                    lambda data: data_is_scversedatastructure(data, "MuData"),
+                    lambda dataset: data_is_scversedatastructure(dataset, "MuData"),
                     Artifact.from_mudata,
                 ),
                 (
-                    lambda data: data_is_scversedatastructure(data, "SpatialData"),
+                    lambda dataset: data_is_scversedatastructure(
+                        dataset, "SpatialData"
+                    ),
                     Artifact.from_spatialdata,
                 ),
                 (data_is_soma_experiment, Artifact.from_tiledbsoma),
             ]
 
-            for type_check, factory in type_mapping:
+            for type_check, af_constructor in type_mapping:
                 if type_check(self._dataset):
-                    self._artifact = factory(  # type: ignore
+                    self._artifact = af_constructor(  # type: ignore
                         self._dataset,
                         key=key,
                         description=description,
@@ -373,9 +379,8 @@ def is_list_of_type(value, expected_type):
 def check_dtype(expected_type) -> Callable:
     """Creates a check function for Pandera that validates a column's dtype.
 
-    Supports both standard dtype checking and mixed list/single values for
-    the same type. For example, a column with expected_type 'float' would
-    also accept a mix of float values and lists of floats.
+    Supports both standard dtype checking and mixed list/single values for the same type.
+    For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
 
     Args:
         expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
@@ -394,6 +399,8 @@ def check_dtype(expected_type) -> Callable:
             return True
         elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
             return True
+        elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
+            return True
 
         # if we're here, it might be a mixed column with object dtype
         # need to check each value individually
@@ -406,8 +413,10 @@
         elif expected_type_member == "num":
             # for numeric, accept either int or float
             return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
-        elif expected_type_member == "str" or expected_type_member.startswith(
-            "cat["
+        elif (
+            expected_type_member == "str"
+            or expected_type_member == "path"
+            or expected_type_member.startswith("cat[")
         ):
             return series.apply(lambda x: is_list_of_type(x, str)).all()
 
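Note: the new "path" member behaves exactly like "str" in both branches — the fast dtype check and the per-value scan of object columns. A self-contained illustration of the mixed-column rule, using plain pandas:

    import pandas as pd

    def is_list_of(x, t):
        return isinstance(x, t) or (isinstance(x, list) and all(isinstance(i, t) for i in x))

    # scalars and lists of the same type yield an object-dtype column,
    # which the per-value check still accepts for the expected type
    mixed = pd.Series([1.0, [2.0, 3.0], 4.0])
    assert mixed.dtype == object
    assert mixed.apply(lambda x: is_list_of(x, float)).all()

    # "path" values are plain strings, so the string dtype check passes
    paths = pd.Series(["s3://bucket/a.parquet", "data/b.parquet"])
    assert pd.api.types.is_string_dtype(paths.dtype)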
@@ -490,9 +499,12 @@ class DataFrameCurator(Curator):
         else:
             required = False
         # series.dtype is "object" if the column has lists types, e.g. [["a", "b"], ["a"], ["b"]]
-        if feature.dtype in {"int", "float", "num"} or feature.dtype.startswith(
-            "list"
-        ):
+        if feature.dtype in {
+            "int",
+            "float",
+            "num",
+            "path",
+        } or feature.dtype.startswith("list"):
             if isinstance(self._dataset, pd.DataFrame):
                 dtype = (
                     self._dataset[feature.name].dtype
@@ -501,9 +513,9 @@
                 )
             else:
                 dtype = None
-            pandera_columns[feature.name] = pa.Column(
+            pandera_columns[feature.name] = pandera.Column(
                 dtype=None,
-                checks=pa.Check(
+                checks=pandera.Check(
                     check_dtype(feature.dtype),
                     element_wise=False,
                     error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
@@ -518,7 +530,7 @@
                 if not feature.dtype.startswith("cat")
                 else "category"
             )
-            pandera_columns[feature.name] = pa.Column(
+            pandera_columns[feature.name] = pandera.Column(
                 pandera_dtype,
                 nullable=feature.nullable,
                 coerce=feature.coerce_dtype,
@@ -533,24 +545,26 @@
         if schema._index_feature_uid is not None:
             # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
             # so, we're typing it as `str` here
-            index = pa.Index(
+            index = pandera.Index(
                 schema.index.dtype
                 if not schema.index.dtype.startswith("cat")
                 else str
             )
         else:
             index = None
-        self._pandera_schema = pa.DataFrameSchema(
+        self._pandera_schema = pandera.DataFrameSchema(
             pandera_columns,
             coerce=schema.coerce_dtype,
             strict=schema.maximal_set,
             ordered=schema.ordered_set,
             index=index,
         )
+        # in the DataFrameCatManager, we use the
+        # actual columns of the dataset, not the pandera columns
+        # the pandera columns might have additional optional columns
         self._cat_manager = DataFrameCatManager(
             self._dataset,
             columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
-            columns_names=pandera_columns.keys(),
             categoricals=categoricals,
             index=schema.index,
             slot=slot,
@@ -621,10 +635,10 @@
         if self._schema.n > 0:
             try:
                 # first validate through pandera
-                self._pandera_schema.validate(self._dataset)
+                self._pandera_schema.validate(self._dataset, lazy=True)
                 # then validate lamindb categoricals
                 self._cat_manager_validate()
-            except pa.errors.SchemaError as err:
+            except (pandera.errors.SchemaError, pandera.errors.SchemaErrors) as err:
                 self._is_validated = False
                 # .exconly() doesn't exist on SchemaError
                 raise ValidationError(str(err)) from err
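Note: with `lazy=True`, pandera collects every failure before raising and then raises `SchemaErrors` (plural) instead of stopping at the first `SchemaError` — which is why the except clause widens. A self-contained illustration (using the plain `pandera` namespace; the diff imports `pandera.pandas`, available in pandera >= 0.24):

    import pandas as pd
    import pandera as pa

    schema = pa.DataFrameSchema({"a": pa.Column(int), "b": pa.Column(str)})
    df = pd.DataFrame({"a": ["x"], "b": [1]})  # both columns have the wrong dtype

    try:
        schema.validate(df, lazy=True)
    except pa.errors.SchemaErrors as err:
        # err.failure_cases is a DataFrame listing every collected failure
        print(err.failure_cases)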
@@ -904,7 +918,7 @@ class SpatialDataCurator(SlotsCurator):
 
 
 class TiledbsomaExperimentCurator(SlotsCurator):
-    """Curator for `TileDB-SOMA`.
+    """Curator for `tiledbsoma.Experiment`.
 
     Args:
         dataset: The `tiledbsoma.Experiment` object.
@@ -933,7 +947,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
 
         for slot, slot_schema in schema.slots.items():
             if slot.startswith("ms:"):
-                ms, modality_slot = slot.split(":")
+                _, modality_slot = slot.split(":")
                 schema_dataset = (
                     self._dataset.ms[modality_slot.removesuffix(".T")]
                     .var.read()
@@ -943,21 +957,12 @@
                 )
 
                 self._slots[slot] = DataFrameCurator(
-                    (
-                        schema_dataset.T
-                        if modality_slot == "var.T"
-                        or (
-                            # backward compat
-                            modality_slot == "var"
-                            and schema.slots[slot].itype not in {None, "Feature"}
-                        )
-                        else schema_dataset
-                    ),
+                    (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
                     slot_schema,
                 )
             else:
                 # global Experiment obs slot
-                _ms, modality_slot = None, slot
+                modality_slot = slot
                 schema_dataset = (
                     self._dataset.obs.read()
                     .concat()
@@ -969,16 +974,8 @@
                     slot_schema,
                 )
 
-            if modality_slot == "var" and schema.slots[slot].itype not in {
-                None,
-                "Feature",
-            }:
-                logger.warning(
-                    "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
-                )
-
             _assign_var_fields_categoricals_multimodal(
-                modality=slot,  # not using "ms" here as it would always be the same for all modalities
+                modality=slot,  # not passing `measurement` here because it's a constant. The slot has the actual modality
                 slot_type=modality_slot,
                 slot=slot,
                 slot_schema=slot_schema,
@@ -1020,6 +1017,13 @@ class CatVector:
         self.feature = feature
         self.records = None
         self._maximal_set = maximal_set
+
+        self._all_filters = {"source": self._source, "organism": self._organism}
+        if self._subtype_str and "=" in self._subtype_str:
+            self._all_filters.update(
+                resolve_relation_filters(parse_filter_string(self._subtype_str), self)  # type: ignore
+            )
+
         if hasattr(field.field.model, "_name_field"):
             label_ref_is_name = field.field.name == field.field.model._name_field
         else:
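Note: a subtype string containing "=" is now interpreted as a filter expression. The real grammar lives in `lamindb.models.feature.parse_filter_string`; the sketch below is a hypothetical simplification of the idea (the name and splitting rules are assumptions, not the library's implementation):

    def parse_filter_string_sketch(filter_str: str) -> dict:
        """Hypothetical: split 'key=value, key2=value2' into a filter dict."""
        filters = {}
        for clause in filter_str.split(","):
            key, _, value = clause.partition("=")
            filters[key.strip()] = value.strip()
        return filters

    assert parse_filter_string_sketch("species=human, tissue=lung") == {
        "species": "human",
        "tissue": "lung",
    }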
@@ -1049,7 +1053,7 @@
         # should probably add a setting `at_least_one_validated`
         result = True
         if len(self.values) > 0 and len(self.values) == len(self._non_validated):
-            result = False
+            logger.warning(f"no values were validated for {self._key}!")
         # len(self._non_validated) != 0
         # if maximal_set is True, return False
         # if maximal_set is False, return True
@@ -1116,9 +1120,15 @@
         registry = self._field.field.model
         field_name = self._field.field.name
         model_field = registry.__get_name_with_module__()
-        filter_kwargs = get_current_filter_kwargs(
-            registry, {"organism": self._organism, "source": self._source}
-        )
+        filter_kwargs = get_current_filter_kwargs(registry, self._all_filters)
+
+        valid_from_values_kwargs = {}
+        for key, value in filter_kwargs.items():
+            if key in {"field", "organism", "source", "mute"}:
+                valid_from_values_kwargs[key] = value
+            elif hasattr(registry, key) and "__" not in key:
+                valid_from_values_kwargs[key] = value
+
         values = [
             i
             for i in self.values
@@ -1133,13 +1143,13 @@
         str_values = _flatten_unique(values)
 
         # inspect the default instance and save validated records from public
-        if (
-            self._subtype_str != "" and "__" not in self._subtype_str
-        ):  # not for general filter expressions
+        if self._subtype_str != "" and "=" not in self._subtype_str:
             related_name = registry._meta.get_field("type").remote_field.related_name
-            self._subtype_query_set = getattr(
-                registry.get(name=self._subtype_str), related_name
-            ).all()
+            type_record = registry.get(name=self._subtype_str)
+            if registry.__name__ == "Record":
+                self._subtype_query_set = type_record.query_children()
+            else:
+                self._subtype_query_set = getattr(type_record, related_name).all()
             values_array = np.array(str_values)
             validated_mask = self._subtype_query_set.validate(  # type: ignore
                 values_array, field=self._field, **filter_kwargs, mute=True
@@ -1149,11 +1159,14 @@
                 values_array[~validated_mask],
             )
             records = registry.from_values(
-                validated_labels, field=self._field, **filter_kwargs, mute=True
+                validated_labels,
+                field=self._field,
+                **valid_from_values_kwargs,
+                mute=True,
             )
         else:
             existing_and_public_records = registry.from_values(
-                str_values, field=self._field, **filter_kwargs, mute=True
+                str_values, field=self._field, **valid_from_values_kwargs, mute=True
             )
             existing_and_public_labels = [
                 getattr(r, field_name) for r in existing_and_public_records
@@ -1236,16 +1249,25 @@
         field_name = self._field.field.name
         model_field = f"{registry.__name__}.{field_name}"
 
-        kwargs_current = get_current_filter_kwargs(
-            registry, {"organism": self._organism, "source": self._source}
-        )
+        kwargs_current = get_current_filter_kwargs(registry, self._all_filters)
+
+        valid_inspect_kwargs = {}
+        for key, value in kwargs_current.items():
+            if key in {"field", "organism", "source", "mute", "from_source"}:
+                valid_inspect_kwargs[key] = value
+            elif hasattr(registry, key) and "__" not in key:
+                valid_inspect_kwargs[key] = value
 
         # inspect values from the default instance, excluding public
         registry_or_queryset = registry
         if self._subtype_query_set is not None:
             registry_or_queryset = self._subtype_query_set
         inspect_result = registry_or_queryset.inspect(
-            values, field=self._field, mute=True, from_source=False, **kwargs_current
+            values,
+            field=self._field,
+            mute=True,
+            from_source=False,
+            **valid_inspect_kwargs,
         )
         non_validated = inspect_result.non_validated
         syn_mapper = inspect_result.synonyms_mapper
@@ -1257,7 +1279,7 @@
             non_validated,
             field=self._field,
             mute=True,
-            **kwargs_current,
+            **valid_inspect_kwargs,
         )
         values_validated += [getattr(r, field_name) for r in public_records]
 
@@ -1309,10 +1331,6 @@
         self._validated, self._non_validated = self._add_validated()
         self._non_validated, self._synonyms = self._validate(values=self._non_validated)
 
-        # always register new Features if they are columns
-        if self._key == "columns" and self._field == Feature.name:
-            self.add_new()
-
     def standardize(self) -> None:
         """Standardize the vector."""
         registry = self._field.field.model
@@ -1363,7 +1381,6 @@ class DataFrameCatManager:
         self,
         df: pd.DataFrame | Artifact,
         columns_field: FieldAttr = Feature.name,
-        columns_names: Iterable[str] | None = None,
         categoricals: list[Feature] | None = None,
         sources: dict[str, SQLRecord] | None = None,
         index: Feature | None = None,
@@ -1387,29 +1404,19 @@
         self._slot = slot
         self._maximal_set = maximal_set
 
-        if columns_names is None:
-            columns_names = []
-        if columns_field == Feature.name:
-            self._cat_vectors["columns"] = CatVector(
-                values_getter=columns_names,
-                field=columns_field,
-                key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
-                source=self._sources.get("columns"),
-                cat_manager=self,
-                maximal_set=self._maximal_set,
-            )
-        else:
-            self._cat_vectors["columns"] = CatVector(
-                values_getter=lambda: self._dataset.columns,  # lambda ensures the inplace update
-                values_setter=lambda new_values: setattr(
-                    self._dataset, "columns", pd.Index(new_values)
-                ),
-                field=columns_field,
-                key="columns",
-                source=self._sources.get("columns"),
-                cat_manager=self,
-                maximal_set=self._maximal_set,
-            )
+        self._cat_vectors["columns"] = CatVector(
+            values_getter=lambda: self._dataset.keys(),  # lambda ensures the inplace update
+            values_setter=lambda new_values: setattr(
+                self._dataset, "columns", pd.Index(new_values)
+            )
+            if isinstance(self._dataset, pd.DataFrame)
+            else None,
+            field=columns_field,
+            key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
+            source=self._sources.get("columns"),
+            cat_manager=self,
+            maximal_set=self._maximal_set,
+        )
         for feature in self._categoricals:
             result = parse_dtype(feature.dtype)[
                 0
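Note: `.keys()` is what lets a single CatVector construction serve both containers — on a DataFrame it returns the columns, while dict-like datasets return their keys; only DataFrames get a values_setter, since only they support renaming columns in place. The getter half in isolation:

    import pandas as pd

    df = pd.DataFrame({"cell_type": ["T cell"], "donor": ["D1"]})
    assert list(df.keys()) == ["cell_type", "donor"]  # same as df.columns

    d = {"cell_type": ["T cell"], "donor": ["D1"]}    # dict-like dataset
    assert list(d.keys()) == ["cell_type", "donor"]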
@@ -1533,25 +1540,19 @@
         self._cat_vectors[key].add_new(**kwargs)
 
 
-def get_current_filter_kwargs(registry: type[SQLRecord], kwargs: dict) -> dict:
+def get_current_filter_kwargs(
+    registry: type[SQLRecord], kwargs: dict[str, SQLRecord]
+) -> dict:
     """Make sure the source and organism are saved in the same database as the registry."""
     db = registry.filter().db
-    source = kwargs.get("source")
-    organism = kwargs.get("organism")
     filter_kwargs = kwargs.copy()
 
-    if isinstance(organism, SQLRecord) and organism._state.db != "default":
-        if db is None or db == "default":
-            organism_default = copy.copy(organism)
-            # save the organism record in the default database
-            organism_default.save()
-            filter_kwargs["organism"] = organism_default
-    if isinstance(source, SQLRecord) and source._state.db != "default":
-        if db is None or db == "default":
-            source_default = copy.copy(source)
-            # save the source record in the default database
-            source_default.save()
-            filter_kwargs["source"] = source_default
+    for key, value in kwargs.items():
+        if isinstance(value, SQLRecord) and value._state.db != "default":
+            if db is None or db == "default":
+                value_default = copy.copy(value)
+                value_default.save()
+                filter_kwargs[key] = value_default
 
     return filter_kwargs
 
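Note: the rewrite treats organism and source as two instances of one general rule — any record-valued filter living in a non-default database is copied and saved into the default one. A toy of the control flow, with a stub standing in for SQLRecord (names are illustrative):

    import copy

    class StubRecord:
        """Stand-in for SQLRecord; only the bits the loop touches."""
        def __init__(self, db):
            self.db = db  # stands in for record._state.db
        def save(self):
            self.db = "default"

    def normalize_filters(kwargs: dict, current_db: str | None) -> dict:
        out = kwargs.copy()
        for key, value in kwargs.items():
            if isinstance(value, StubRecord) and value.db != "default":
                if current_db is None or current_db == "default":
                    value_default = copy.copy(value)
                    value_default.save()  # persists the copy into the default database
                    out[key] = value_default
        return out

    r = StubRecord(db="other-instance")
    assert normalize_filters({"organism": r}, current_db="default")["organism"].db == "default"
    assert r.db == "other-instance"  # the original record is left untouched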
lamindb/errors.py CHANGED
@@ -7,10 +7,13 @@
    InvalidArgument
    DoesNotExist
    NotebookNotSaved
+   UnknownStorageLocation
    MissingContextUID
    UpdateContext
    IntegrityError
+   FieldValidationError
    SQLRecordNameChangeIntegrityError
+   NoWriteAccess
 
 """
@@ -43,6 +46,12 @@ class NotebookNotSaved(Exception):
     pass
 
 
+class UnknownStorageLocation(Exception):
+    """Path is not contained in any known storage location."""
+
+    pass
+
+
 # equivalent to Django's DoesNotExist
 # and SQLAlchemy's NoResultFound
 class DoesNotExist(Exception):
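Note: a hedged usage sketch for the new exception — the triggering call below is an assumption; only the class itself is defined in the diff above:

    import lamindb as ln
    from lamindb.errors import UnknownStorageLocation

    try:
        # hypothetical trigger: referencing a path outside any registered storage location
        ln.Artifact("s3://some-unregistered-bucket/file.parquet", key="file.parquet").save()
    except UnknownStorageLocation as err:
        print(err)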