lamindb 1.10.1__py3-none-any.whl → 1.11a1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (52)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +14 -12
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +45 -2
  8. lamindb/core/storage/_anndata_accessor.py +118 -26
  9. lamindb/core/storage/_backed_access.py +10 -7
  10. lamindb/core/storage/_spatialdata_accessor.py +15 -4
  11. lamindb/core/storage/_zarr.py +3 -0
  12. lamindb/curators/_legacy.py +16 -3
  13. lamindb/curators/core.py +439 -191
  14. lamindb/examples/cellxgene/__init__.py +8 -3
  15. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  16. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  17. lamindb/examples/croissant/__init__.py +12 -2
  18. lamindb/examples/datasets/__init__.py +2 -2
  19. lamindb/examples/datasets/_core.py +1 -1
  20. lamindb/examples/datasets/_small.py +66 -22
  21. lamindb/examples/datasets/mini_immuno.py +1 -0
  22. lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
  23. lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
  24. lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
  25. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  26. lamindb/migrations/0121_recorduser.py +53 -0
  27. lamindb/models/__init__.py +3 -1
  28. lamindb/models/_describe.py +2 -2
  29. lamindb/models/_feature_manager.py +53 -53
  30. lamindb/models/_from_values.py +2 -2
  31. lamindb/models/_is_versioned.py +4 -4
  32. lamindb/models/_label_manager.py +4 -4
  33. lamindb/models/artifact.py +336 -136
  34. lamindb/models/artifact_set.py +36 -1
  35. lamindb/models/can_curate.py +1 -2
  36. lamindb/models/collection.py +3 -34
  37. lamindb/models/feature.py +111 -7
  38. lamindb/models/has_parents.py +11 -11
  39. lamindb/models/project.py +42 -2
  40. lamindb/models/query_manager.py +16 -7
  41. lamindb/models/query_set.py +59 -34
  42. lamindb/models/record.py +25 -4
  43. lamindb/models/run.py +8 -6
  44. lamindb/models/schema.py +54 -26
  45. lamindb/models/sqlrecord.py +123 -25
  46. lamindb/models/storage.py +59 -14
  47. lamindb/models/transform.py +17 -17
  48. lamindb/models/ulabel.py +6 -1
  49. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/METADATA +3 -3
  50. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/RECORD +52 -47
  51. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/LICENSE +0 -0
  52. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/WHEEL +0 -0
@@ -1,7 +1,6 @@
 # ruff: noqa: TC004
 from __future__ import annotations
 
-import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
@@ -63,7 +62,7 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import IntegrityError, InvalidArgument, ValidationError
+from ..errors import InvalidArgument, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
@@ -201,7 +200,7 @@ def process_pathlike(
         # hence, we revert the creation and throw an error
         storage_record.delete()
         raise UnknownStorageLocation(
-            f"Path {filepath} is not contained in any known storage location:\n{Storage.df()[['uid', 'root', 'type']]}\n\n"
+            f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
             f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
         )
         use_existing_storage_key = True
@@ -542,6 +541,7 @@ def log_storage_hint(
 def data_is_scversedatastructure(
     data: ScverseDataStructures | UPathStr,
     structure_type: Literal["AnnData", "MuData", "SpatialData"] | None = None,
+    cloud_warning: bool = True,
 ) -> bool:
     """Determine whether a specific in-memory object or a UPathstr is any or a specific scverse data structure."""
     file_suffix = None
@@ -551,12 +551,19 @@ def data_is_scversedatastructure(
         file_suffix = ".h5mu"
     # SpatialData does not have a unique suffix but `.zarr`
 
+    # AnnData allows both AnnDataAccessor and AnnData
+    class_name = data.__class__.__name__
     if structure_type is None:
         return any(
-            hasattr(data, "__class__") and data.__class__.__name__ == cl_name
+            class_name
+            in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
            for cl_name in ["AnnData", "MuData", "SpatialData"]
        )
-    elif hasattr(data, "__class__") and data.__class__.__name__ == structure_type:
+    elif class_name in (
+        ["AnnData", "AnnDataAccessor"]
+        if structure_type == "AnnData"
+        else [structure_type]
+    ):
         return True
 
     data_type = structure_type.lower()
@@ -580,11 +587,12 @@ def data_is_scversedatastructure(
                 )
                 == data_type
             )
-        else:
+        elif cloud_warning:
             logger.warning(
                 f"we do not check whether cloud zarr is {structure_type}"
             )
         return False
+
     return False
 
 
@@ -600,23 +608,24 @@ def data_is_soma_experiment(data: SOMAExperiment | UPathStr) -> bool:
 def _check_otype_artifact(
     data: UPathStr | pd.DataFrame | ScverseDataStructures,
     otype: str | None = None,
+    cloud_warning: bool = True,
 ) -> str:
     if otype is None:
         if isinstance(data, pd.DataFrame):
-            logger.warning("data is a DataFrame, please use .from_df()")
+            logger.warning("data is a DataFrame, please use .from_dataframe()")
             otype = "DataFrame"
             return otype
 
         data_is_path = isinstance(data, (str, Path))
-        if data_is_scversedatastructure(data, "AnnData"):
+        if data_is_scversedatastructure(data, "AnnData", cloud_warning):
            if not data_is_path:
                logger.warning("data is an AnnData, please use .from_anndata()")
            otype = "AnnData"
-        elif data_is_scversedatastructure(data, "MuData"):
+        elif data_is_scversedatastructure(data, "MuData", cloud_warning):
            if not data_is_path:
                logger.warning("data is a MuData, please use .from_mudata()")
            otype = "MuData"
-        elif data_is_scversedatastructure(data, "SpatialData"):
+        elif data_is_scversedatastructure(data, "SpatialData", cloud_warning):
            if not data_is_path:
                logger.warning("data is a SpatialData, please use .from_spatialdata()")
            otype = "SpatialData"
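
The two hunks above thread a new `cloud_warning` flag through the type checks and make a backed `AnnDataAccessor` pass the `AnnData` check. A minimal standalone sketch of the dispatch rule (an illustration of the logic, not the library function itself; `obj` is a hypothetical object):

    def matches_structure(obj, structure_type: str) -> bool:
        # AnnData accepts both the in-memory object and its backed accessor
        accepted = (
            ["AnnData", "AnnDataAccessor"]
            if structure_type == "AnnData"
            else [structure_type]
        )
        return obj.__class__.__name__ in accepted
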
@@ -871,7 +880,7 @@ def get_labels(
 
         values = []
         for v in qs_by_registry.values():
-            values += v.list(get_name_field(v))
+            values += v.to_list(get_name_field(v))
         return values
     if len(registries_to_check) == 1 and registry in qs_by_registry:
         return qs_by_registry[registry]
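
These two hunks belong to a release-wide rename of the QuerySet conveniences: `.df()` becomes `.to_dataframe()` and `.list()` becomes `.to_list()` (see also the `Storage.to_dataframe()` change above). A usage sketch, assuming a populated instance:

    import lamindb as ln

    ln.Storage.to_dataframe()[["uid", "root", "type"]]  # formerly Storage.df()
    keys = ln.Artifact.filter().to_list("key")  # formerly .list("key")
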
@@ -894,7 +903,7 @@ def add_labels(
         raise ValueError("Please save the artifact/collection before adding a label!")
 
     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
-        records = records.list()
+        records = records.to_list()
     if isinstance(records, (str, SQLRecord)):
         records = [records]
     if not isinstance(records, list):  # avoids warning for pd Series
@@ -993,6 +1002,112 @@ add_labels(
     )
 
 
+def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+    # need to grab file path before deletion
+    try:
+        path, _ = filepath_from_artifact(artifact, using_key)
+    except OSError:
+        # we can still delete the record
+        logger.warning("Could not get path")
+        storage = False
+    # only delete in storage if DB delete is successful
+    # DB delete might error because of a foreign key constraint violated etc.
+    if artifact._overwrite_versions and artifact.is_latest:
+        logger.important(
+            "deleting all versions of this artifact because they all share the same store"
+        )
+        for version in artifact.versions.all():  # includes artifact
+            _delete_skip_storage(version)
+    else:
+        artifact._delete_skip_storage()
+    # by default do not delete storage if deleting only a previous version
+    # and the underlying store is mutable
+    if artifact._overwrite_versions and not artifact.is_latest:
+        delete_in_storage = False
+        if storage:
+            logger.warning(
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+            )
+    elif artifact.key is None or artifact._key_is_virtual:
+        # do not ask for confirmation also if storage is None
+        delete_in_storage = storage is None or storage
+    else:
+        # for artifacts with non-virtual semantic storage keys (key is not None)
+        # ask for extra-confirmation
+        if storage is None:
+            response = input(
+                f"Are you sure to want to delete {path}? (y/n) You can't undo"
+                " this action."
+            )
+            delete_in_storage = response == "y"
+        else:
+            delete_in_storage = storage
+    if not delete_in_storage:
+        logger.important(f"a file/folder remains here: {path}")
+    # we don't yet have logic to bring back the deleted metadata record
+    # in case storage deletion fails - this is important for ACID down the road
+    if delete_in_storage:
+        delete_msg = delete_storage(path, raise_file_not_found_error=False)
+        if delete_msg != "did-not-delete":
+            logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+class LazyArtifact:
+    """Lazy artifact for streaming to auto-generated internal paths.
+
+    This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+    and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+    This object creates a real artifact on `.save()` with the provided arguments.
+
+    Args:
+        suffix: The suffix for the auto-generated internal path
+        overwrite_versions: Whether to overwrite versions.
+        **kwargs: Keyword arguments for the artifact to be created.
+
+    Examples:
+
+        Create a lazy artifact, write to the path and save to get a real artifact::
+
+            lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+            zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+            artifact = lazy.save()
+    """
+
+    def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+        self.kwargs = kwargs
+        self.kwargs["overwrite_versions"] = overwrite_versions
+
+        if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+            PurePosixPath(key)
+        ) != suffix:
+            raise ValueError(
+                "The suffix argument and the suffix of key should be the same."
+            )
+
+        uid, _ = create_uid(n_full_id=20)
+        storage_key = auto_storage_key_from_artifact_uid(
+            uid, suffix, overwrite_versions=overwrite_versions
+        )
+        storepath = setup_settings.storage.root / storage_key
+
+        self._path = storepath
+
+    @property
+    def path(self) -> UPath:
+        return self._path
+
+    def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+        artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+        return artifact.save(upload=upload, **kwargs)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+        return (
+            f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+        )
+
+
 class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     # Note that this docstring has to be consistent with Curator.save_artifact()
     """Datasets & models stored as files, folders, or arrays.
@@ -1028,15 +1143,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
 
-    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
+    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::
 
         schema = ln.Schema(itype=ln.Feature)  # a schema that merely enforces that feature names exist in the Feature registry
-        artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+        artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+
+    To annotate by **external features**::
+
+        schema = ln.examples.schemas.valid_features()
+        artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+    A `schema` can be optionally passed to also validate the features.
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
         artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
-        artifact_v2.versions.df()  # see all versions
+        artifact_v2.versions.to_dataframe()  # see all versions
 
     You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
@@ -1110,6 +1232,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
         abstract = False
+        app_label = "lamindb"
         constraints = [
             # a simple hard unique constraint on `hash` clashes with the fact
             # that pipelines sometimes aim to ingest the exact same file in different
@@ -1157,11 +1280,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             ln.Artifact.filter(scientist="Barbara McClintock")
 
-        Features may or may not be part of the dataset, i.e., the artifact content in storage. For
-        instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
-        `DataFrame`-like artifact and annotates it with features corresponding to
-        these columns. `artifact.features.add_values`, by contrast, does not
-        validate the content of the artifact.
+        Features may or may not be part of the dataset, i.e., the artifact content in storage.
+        For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+        `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+        `artifact.features.add_values`, by contrast, does not validate the content of the artifact.
 
 
     .. dropdown:: An example for a model-like artifact
@@ -1176,6 +1298,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     "subset_highlyvariable": True,
                 },
             })
+
+        To validate external features::
+
+            schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+            artifact.features.add_values({"species": "bird"}, schema=schema)
     """
     from ._feature_manager import FeatureManager
 
@@ -1385,15 +1512,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # now proceed with the user-facing constructor
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: data")
+
         data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
         kind: str = kwargs.pop("kind", None)
         key: str | None = kwargs.pop("key", None)
         run_id: int | None = kwargs.pop("run_id", None)  # for REST API
         run: Run | None = kwargs.pop("run", None)
+        using_key = kwargs.pop("using_key", None)
         description: str | None = kwargs.pop("description", None)
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
+
+        features: dict[str, Any] = kwargs.pop("features", None)
+        schema: Schema | None = kwargs.pop("schema", None)
+        if features is not None and schema is not None:
+            from lamindb.curators import DataFrameCurator
+
+            temp_df = pd.DataFrame([features])
+            validation_schema = schema
+            if schema.itype == "Composite" and schema.slots:
+                if len(schema.slots) > 1:
+                    raise ValueError(
+                        f"Composite schema has {len(schema.slots)} slots. "
+                        "External feature validation only supports schemas with a single slot."
+                    )
+                try:
+                    validation_schema = next(
+                        k for k in schema.slots.keys() if k.startswith("__external")
+                    )
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot that starts with __external."
+                    ) from None
+
+            external_curator = DataFrameCurator(temp_df, validation_schema)
+            external_curator.validate()
+            external_curator._artifact = self
+
+            self._external_features = features
+
         branch_id: int | None = None
         if "visibility" in kwargs:  # backward compat
             branch_id = kwargs.pop("visibility")
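
Together with the docstring additions earlier, this gives the `Artifact(...)` constructor a `features=` keyword: the dict is wrapped into a one-row DataFrame, validated with a `DataFrameCurator` against `schema` (or against a single `__external`-prefixed slot of a Composite schema), and stashed for annotation on `.save()`. A hedged sketch of the intended call, assuming a parquet file and the schema below:

    import lamindb as ln

    schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
    artifact = ln.Artifact(
        "./my_file.parquet",
        key="examples/my_file.parquet",
        features={"species": "bird"},  # validated at construction, written on save()
        schema=schema,
    ).save()
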
@@ -1404,13 +1562,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         else:
             branch_id = 1
         branch = kwargs.pop("branch", None)
+
         space = kwargs.pop("space", None)
-        space_id = kwargs.pop("space_id", 1)
+        assert "space_id" not in kwargs, "please pass space instead"  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
+        storage_was_passed = False
         if "storage" in kwargs:
             storage = kwargs.pop("storage")
+            storage_was_passed = True
         elif (
             setup_settings.instance.keep_artifacts_local
             and setup_settings.instance._local_storage is not None
@@ -1418,13 +1579,32 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             storage = setup_settings.instance.local_storage.record
         else:
             storage = setup_settings.instance.storage.record
-        using_key = kwargs.pop("using_key", None)
+        if space is None:
+            from lamindb import context as run_context
+
+            if run_context.space is not None:
+                space = run_context.space
+            elif setup_settings.space is not None:
+                space = setup_settings.space
+        if space is not None and space != storage.space:
+            if storage_was_passed:
+                logger.warning(
+                    "storage argument ignored as storage information from space takes precedence"
+                )
+            storage_locs_for_space = Storage.filter(space=space)
+            storage = storage_locs_for_space.first()
+            if len(storage_locs_for_space) > 1:
+                logger.warning(
+                    f"more than one storage location for space {space}, choosing {storage}"
+                )
         otype = kwargs.pop("otype") if "otype" in kwargs else None
         if isinstance(data, str) and data.startswith("s3:///"):
             # issue in Groovy / nf-lamin producing malformed S3 paths
             # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q
             data = data.replace("s3:///", "s3://")
-        otype = _check_otype_artifact(data=data, otype=otype)
+        otype = _check_otype_artifact(
+            data=data, otype=otype, cloud_warning=not _is_internal_call
+        )
         if "type" in kwargs:
             logger.warning("`type` will be removed soon, please use `kind`")
             kind = kwargs.pop("type")
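
The block above changes how the default storage location is resolved: if a space is set (explicitly, via the tracked run context, or via settings) and differs from the chosen storage's space, lamindb now picks a storage location registered for that space, warning when a passed `storage` argument is overridden or several candidates exist. A sketch of the new precedence, under the assumption that a space with its own storage location exists and that the space setting is exposed as shown:

    import lamindb as ln

    ln.settings.space = "our-team-space"  # assumption: this space exists and has a storage location
    # a storage= argument pointing elsewhere would be ignored with a warning;
    # the artifact lands in a storage location registered for the space
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
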
@@ -1457,6 +1637,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
         else:
             is_automanaged_path = False
+
         provisional_uid, revises = create_uid(revises=revises, version=version)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
@@ -1514,7 +1695,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             uid, revises = create_uid(revises=revises, version=version)
             kwargs["uid"] = uid
 
-        # only set key now so that we don't do a look-up on it in case revises is passed
+        # only set key now so that we don't perform a look-up on it in case revises is passed
         if revises is not None and revises.key is not None and kwargs["key"] is None:
             kwargs["key"] = revises.key
 
@@ -1526,7 +1707,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["branch"] = branch
         kwargs["branch_id"] = branch_id
         kwargs["space"] = space
-        kwargs["space_id"] = space_id
         kwargs["otype"] = otype
         kwargs["revises"] = revises
         # this check needs to come down here because key might be populated from an
@@ -1540,6 +1720,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1623,6 +1840,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             idlike: Either a uid stub, uid or an integer id.
             is_run_input: Whether to track this artifact as run input.
             expressions: Fields and values passed as Django query expressions.
+                Use `path=...` to get an artifact for a local or remote filepath if exists.
 
         Raises:
             :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1637,6 +1855,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
             artifact = ln.Arfifact.get(key="examples/my_file.parquet")
+            artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
         """
         from .query_set import QuerySet
 
@@ -1706,7 +1925,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         )
 
     @classmethod
-    def from_df(
+    def from_dataframe(
         cls,
         df: pd.DataFrame,
         *,
1715
1934
  run: Run | None = None,
1716
1935
  revises: Artifact | None = None,
1717
1936
  schema: Schema | None = None,
1937
+ features: dict[str, Any] | None = None,
1718
1938
  **kwargs,
1719
1939
  ) -> Artifact:
1720
1940
  """Create from `DataFrame`, optionally validate & annotate.
@@ -1727,6 +1947,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
+            features: External features dict for additional annotation.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1741,7 +1962,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             import lamindb as ln
 
             df = ln.core.datasets.mini_immuno.get_dataset1()
-            artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+            artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
         With validation and annotation.
 
@@ -1758,6 +1979,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         .. literalinclude:: scripts/define_mini_immuno_features_labels.py
             :language: python
 
+        External features:
+
+        .. literalinclude:: scripts/curate_dataframe_external_features.py
+            :language: python
         """
         artifact = Artifact(  # type: ignore
             data=df,
@@ -1770,8 +1995,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
+
         if schema is not None:
-            from ..curators import DataFrameCurator
+            from lamindb.curators.core import ComponentCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1780,12 +2006,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-            curator = DataFrameCurator(artifact, schema)
-            curator.validate()
-            artifact.schema = schema
-            artifact._curator = curator
+            # Handle external features validation for Composite schemas
+            if schema.itype == "Composite" and features is not None:
+                try:
+                    external_slot = next(
+                        k for k in schema.slots.keys() if "__external__" in k
+                    )
+                    validation_schema = schema.slots[external_slot]
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot __external__."
+                    ) from None
+
+                external_curator = ComponentCurator(
+                    pd.DataFrame([features]), validation_schema
+                )
+                external_curator.validate()
+                artifact._external_features = features
+
+            # Validate main DataFrame if not Composite or if Composite has attrs
+            if schema.itype != "Composite" or "attrs" in schema.slots:
+                curator = ComponentCurator(artifact, schema)
+                curator.validate()
+                artifact.schema = schema
+                artifact._curator = curator
+
         return artifact
 
+    @classmethod
+    @deprecated("from_dataframe")
+    def from_df(
+        cls,
+        df: pd.DataFrame,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        revises: Artifact | None = None,
+        schema: Schema | None = None,
+        **kwargs,
+    ) -> Artifact:
+        return cls.from_dataframe(
+            df,
+            key=key,
+            description=description,
+            run=run,
+            revises=revises,
+            schema=schema,
+            **kwargs,
+        )
+
     @classmethod
     def from_anndata(
         cls,
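
`from_df` thus survives only as a deprecated alias that forwards to `from_dataframe`; both produce the same artifact, with the old name flagged via the `@deprecated` decorator. For example:

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()
    artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()  # new spelling
    # ln.Artifact.from_df(df, key=...) still works but emits a deprecation warning
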
@@ -2285,17 +2555,19 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     ):
         """Open a dataset for streaming.
 
-        Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
-        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
+        Works for `AnnData` (`.h5ad` and `.zarr`), `SpatialData` (`.zarr`),
+        generic `hdf5` and `zarr`, `tiledbsoma` objects (`.tiledbsoma`),
+        `pyarrow` or `polars` compatible formats
         (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
         Args:
-            mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
+            mode: can be `"r"` or `"w"` (write mode) for `tiledbsoma` stores,
+                `"r"` or `"r+"` for `AnnData` or `SpatialData` `zarr` stores,
                 otherwise should be always `"r"` (read-only mode).
             engine: Which module to use for lazy loading of a dataframe
                 from `pyarrow` or `polars` compatible formats.
                 This has no effect if the artifact is not a dataframe, i.e.
-                if it is an `AnnData,` `hdf5`, `zarr` or `tiledbsoma` object.
+                if it is an `AnnData,` `hdf5`, `zarr`, `tiledbsoma` object etc.
             is_run_input: Whether to track this artifact as run input.
             **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
                 `pyarrow.dataset.dataset`, `polars.scan_*` function.
@@ -2339,7 +2611,8 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 s + ".gz" for s in PYARROW_SUFFIXES
             )  # this doesn't work for externally gzipped files, REMOVE LATER
         )
-        if self.suffix not in suffixes:
+        suffix = self.suffix
+        if suffix not in suffixes:
             raise ValueError(
                 "Artifact should have a zarr, h5, tiledbsoma object"
                 " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
@@ -2348,23 +2621,28 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
                 " (no mixing allowed)."
             )
-        if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
-            raise ValueError(
-                "Only a tiledbsoma store can be openened with `mode!='r'`."
-            )
-
         using_key = settings._using_key
         filepath, cache_key = filepath_cache_key_from_artifact(
             self, using_key=using_key
         )
+
         is_tiledbsoma_w = (
-            filepath.name == "soma" or self.suffix == ".tiledbsoma"
+            filepath.name == "soma" or suffix == ".tiledbsoma"
         ) and mode == "w"
+        is_zarr_w = suffix == ".zarr" and mode == "r+"
+
+        if mode != "r" and not (is_tiledbsoma_w or is_zarr_w):
+            raise ValueError(
+                f"It is not allowed to open a {suffix} object with mode='{mode}'. "
+                "You can open all supported formats with mode='r', "
+                "a tiledbsoma store with mode='w', "
+                "AnnData or SpatialData zarr store with mode='r+'."
+            )
         # consider the case where an object is already locally cached
         localpath = setup_settings.paths.cloud_to_local_no_update(
             filepath, cache_key=cache_key
         )
-        if is_tiledbsoma_w:
+        if is_tiledbsoma_w or is_zarr_w:
             open_cache = False
         else:
             open_cache = not isinstance(
@@ -2395,9 +2673,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             else:
                 localpath.unlink(missing_ok=True)
         else:
-            access = backed_access(
-                filepath, mode, engine, using_key=using_key, **kwargs
-            )
+            access = backed_access(self, mode, engine, using_key=using_key, **kwargs)
             if is_tiledbsoma_w:
 
                 def finalize():
@@ -2413,6 +2689,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                         new_version = Artifact(
                             filepath, revises=self, _is_internal_call=True
                         ).save()
+                        # note: sets _state.db = "default"
                         init_self_from_db(self, new_version)
 
                 if localpath != filepath and localpath.exists():
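
With the generalized check above, `open()` now accepts `mode="r+"` for `AnnData`/`SpatialData` zarr stores in addition to `mode="w"` for tiledbsoma stores; all other formats remain read-only. A sketch, assuming an AnnData `.zarr` artifact exists under the given key:

    import lamindb as ln

    artifact = ln.Artifact.get(key="mydata.zarr")  # assumption: an AnnData zarr artifact
    access = artifact.open(mode="r+")  # writeable accessor; only zarr stores allow "r+"
    # artifact.open(mode="w") would raise ValueError here: "w" is reserved for tiledbsoma
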
@@ -2569,94 +2846,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma". is_latest=True)
             artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
         """
-        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
-        # storage = True if storage is None else storage
-
-        # this first check means an invalid delete fails fast rather than cascading through
-        # database and storage permission errors
-        if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
-            isettings = setup_settings.instance
-            if self.storage.instance_uid != isettings.uid and (
-                storage or storage is None
-            ):
-                raise IntegrityError(
-                    "Cannot simply delete artifacts outside of this instance's managed storage locations."
-                    "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
-                    f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
-                    f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
-                )
-        # by default, we only move artifacts into the trash (branch_id = -1)
-        trash_branch_id = -1
-        if self.branch_id > trash_branch_id and not permanent:
-            if storage is not None:
-                logger.warning("moving artifact to trash, storage arg is ignored")
-            # move to trash
-            self.branch_id = trash_branch_id
-            self.save()
-            logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
-            return
-
-        # if the artifact is already in the trash
-        # permanent delete skips the trash
-        if permanent is None:
-            # ask for confirmation of permanent delete
-            response = input(
-                "Artifact record is already in trash! Are you sure you want to permanently"
-                " delete it? (y/n) You can't undo this action."
-            )
-            delete_record = response == "y"
-        else:
-            assert permanent  # noqa: S101
-            delete_record = True
-
-        if delete_record:
-            # need to grab file path before deletion
-            try:
-                path, _ = filepath_from_artifact(self, using_key)
-            except OSError:
-                # we can still delete the record
-                logger.warning("Could not get path")
-                storage = False
-            # only delete in storage if DB delete is successful
-            # DB delete might error because of a foreign key constraint violated etc.
-            if self._overwrite_versions and self.is_latest:
-                logger.important(
-                    "deleting all versions of this artifact because they all share the same store"
-                )
-                for version in self.versions.all():  # includes self
-                    _delete_skip_storage(version)
-            else:
-                self._delete_skip_storage()
-            # by default do not delete storage if deleting only a previous version
-            # and the underlying store is mutable
-            if self._overwrite_versions and not self.is_latest:
-                delete_in_storage = False
-                if storage:
-                    logger.warning(
-                        "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
-                    )
-            elif self.key is None or self._key_is_virtual:
-                # do not ask for confirmation also if storage is None
-                delete_in_storage = storage is None or storage
-            else:
-                # for artifacts with non-virtual semantic storage keys (key is not None)
-                # ask for extra-confirmation
-                if storage is None:
-                    response = input(
-                        f"Are you sure to want to delete {path}? (y/n) You can't undo"
-                        " this action."
-                    )
-                    delete_in_storage = response == "y"
-                else:
-                    delete_in_storage = storage
-            if not delete_in_storage:
-                logger.important(f"a file/folder remains here: {path}")
-            # we don't yet have logic to bring back the deleted metadata record
-            # in case storage deletion fails - this is important for ACID down the road
-            if delete_in_storage:
-                delete_msg = delete_storage(path, raise_file_not_found_error=False)
-                if delete_msg != "did-not-delete":
-                    logger.success(f"deleted {colors.yellow(f'{path}')}")
+        super().delete(permanent=permanent, storage=storage, using_key=using_key)
 
     @property
     def _is_saved_to_storage_location(self) -> bool | None:
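
The roughly ninety deleted lines are not lost: the trash/confirmation flow moves into `SQLRecord.delete()` plus the module-level `delete_permanently()` helper added earlier, so `Artifact.delete()` reduces to a one-line delegation. The user-facing semantics stay the same:

    artifact.delete()  # moves the record to trash (branch_id = -1)
    artifact.delete(permanent=True)  # deletes the record; may prompt before touching storage
    artifact.delete(permanent=True, storage=False)  # keeps the file/folder in storage
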
@@ -2785,11 +2975,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 local_path_cache,
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+        # Handle external features
+        if hasattr(self, "_external_features") and self._external_features is not None:
+            external_features = self._external_features
+            delattr(self, "_external_features")
+            self.features.add_values(external_features)
+
+        # annotate Artifact
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
             # just annotates this artifact
             curator.save_artifact()
+
         return self
 
     def restore(self) -> None:
@@ -2837,7 +3036,7 @@ def _synchronize_cleanup_on_error(
 
 
 def _delete_skip_storage(artifact, *args, **kwargs) -> None:
-    super(Artifact, artifact).delete(*args, **kwargs)
+    super(SQLRecord, artifact).delete(*args, **kwargs)
 
 
 def _save_skip_storage(artifact, **kwargs) -> None:
@@ -2855,6 +3054,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
     featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
+        app_label = "lamindb"
         unique_together = ("artifact", "featurevalue")
 