lamindb 1.10.2__py3-none-any.whl → 1.11a1__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (47)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +14 -12
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +432 -186
  11. lamindb/examples/cellxgene/__init__.py +8 -3
  12. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  13. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  14. lamindb/examples/croissant/__init__.py +12 -2
  15. lamindb/examples/datasets/__init__.py +2 -2
  16. lamindb/examples/datasets/_core.py +1 -1
  17. lamindb/examples/datasets/_small.py +66 -22
  18. lamindb/examples/datasets/mini_immuno.py +1 -0
  19. lamindb/migrations/0119_squashed.py +5 -2
  20. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  21. lamindb/migrations/0121_recorduser.py +53 -0
  22. lamindb/models/__init__.py +3 -1
  23. lamindb/models/_describe.py +2 -2
  24. lamindb/models/_feature_manager.py +53 -53
  25. lamindb/models/_from_values.py +2 -2
  26. lamindb/models/_is_versioned.py +4 -4
  27. lamindb/models/_label_manager.py +4 -4
  28. lamindb/models/artifact.py +305 -116
  29. lamindb/models/artifact_set.py +36 -1
  30. lamindb/models/can_curate.py +1 -2
  31. lamindb/models/collection.py +3 -34
  32. lamindb/models/feature.py +111 -7
  33. lamindb/models/has_parents.py +11 -11
  34. lamindb/models/project.py +18 -0
  35. lamindb/models/query_manager.py +16 -7
  36. lamindb/models/query_set.py +59 -34
  37. lamindb/models/record.py +25 -4
  38. lamindb/models/run.py +8 -6
  39. lamindb/models/schema.py +54 -26
  40. lamindb/models/sqlrecord.py +123 -25
  41. lamindb/models/storage.py +59 -14
  42. lamindb/models/transform.py +17 -17
  43. lamindb/models/ulabel.py +6 -1
  44. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/METADATA +4 -5
  45. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/RECORD +47 -44
  46. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/WHEEL +1 -1
  47. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11a1.dist-info}/LICENSE +0 -0
@@ -1,7 +1,6 @@
  # ruff: noqa: TC004
  from __future__ import annotations

- import os
  import shutil
  from collections import defaultdict
  from pathlib import Path, PurePath, PurePosixPath
@@ -63,7 +62,7 @@ from ..core.storage.paths import (
  filepath_cache_key_from_artifact,
  filepath_from_artifact,
  )
- from ..errors import IntegrityError, InvalidArgument, ValidationError
+ from ..errors import InvalidArgument, ValidationError
  from ..models._is_versioned import (
  create_uid,
  )
@@ -201,7 +200,7 @@ def process_pathlike(
  # hence, we revert the creation and throw an error
  storage_record.delete()
  raise UnknownStorageLocation(
- f"Path {filepath} is not contained in any known storage location:\n{Storage.df()[['uid', 'root', 'type']]}\n\n"
+ f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
  f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
  )
  use_existing_storage_key = True
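
The `.df()` accessor on registries is renamed to `.to_dataframe()` in this release (the `.list()` to `.to_list()` and `.from_df()` to `.from_dataframe()` renames further below follow the same pattern). A minimal sketch of the renamed call, assuming a loaded lamindb instance::

    import lamindb as ln

    # storage locations as a pandas DataFrame, as used in the error message above
    ln.Storage.to_dataframe()[["uid", "root", "type"]]
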
@@ -552,12 +551,19 @@ def data_is_scversedatastructure(
  file_suffix = ".h5mu"
  # SpatialData does not have a unique suffix but `.zarr`

+ # AnnData allows both AnnDataAccessor and AnnData
+ class_name = data.__class__.__name__
  if structure_type is None:
  return any(
- hasattr(data, "__class__") and data.__class__.__name__ == cl_name
+ class_name
+ in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
  for cl_name in ["AnnData", "MuData", "SpatialData"]
  )
- elif hasattr(data, "__class__") and data.__class__.__name__ == structure_type:
+ elif class_name in (
+ ["AnnData", "AnnDataAccessor"]
+ if structure_type == "AnnData"
+ else [structure_type]
+ ):
  return True

  data_type = structure_type.lower()
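
A minimal, self-contained sketch of the widened class-name check above; `AnnDataAccessor` here is a stand-in for the accessor class in `lamindb/core/storage/_anndata_accessor.py`, so that a streamed cloud-backed AnnData passes the same check as an in-memory `AnnData`::

    class AnnDataAccessor:  # stand-in for lamindb's streaming accessor
        pass

    data = AnnDataAccessor()
    class_name = data.__class__.__name__
    structure_type = "AnnData"
    accepted = (
        ["AnnData", "AnnDataAccessor"] if structure_type == "AnnData" else [structure_type]
    )
    print(class_name in accepted)  # True
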
@@ -586,6 +592,7 @@ def data_is_scversedatastructure(
  f"we do not check whether cloud zarr is {structure_type}"
  )
  return False
+
  return False


@@ -605,7 +612,7 @@ def _check_otype_artifact(
  ) -> str:
  if otype is None:
  if isinstance(data, pd.DataFrame):
- logger.warning("data is a DataFrame, please use .from_df()")
+ logger.warning("data is a DataFrame, please use .from_dataframe()")
  otype = "DataFrame"
  return otype
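
Heeding the warning above, a DataFrame is best passed through the dedicated constructor (a sketch; the key is illustrative, and `.from_df()` survives as a deprecated alias further below)::

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"perturbation": ["DMSO", "IFNG"]})
    artifact = ln.Artifact.from_dataframe(df, key="examples/perturbations.parquet").save()
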
@@ -873,7 +880,7 @@ def get_labels(

  values = []
  for v in qs_by_registry.values():
- values += v.list(get_name_field(v))
+ values += v.to_list(get_name_field(v))
  return values
  if len(registries_to_check) == 1 and registry in qs_by_registry:
  return qs_by_registry[registry]
@@ -896,7 +903,7 @@ def add_labels(
  raise ValueError("Please save the artifact/collection before adding a label!")

  if isinstance(records, (QuerySet, QuerySet.__base__)): # need to have both
- records = records.list()
+ records = records.to_list()
  if isinstance(records, (str, SQLRecord)):
  records = [records]
  if not isinstance(records, list): # avoids warning for pd Series
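
`QuerySet.list()` is renamed to `.to_list()`; a sketch of the new spelling, assuming some `ULabel` records exist::

    import lamindb as ln

    # field values of a queryset as a plain Python list
    names = ln.ULabel.filter().to_list("name")
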
@@ -995,6 +1002,112 @@ def add_labels(
  )


+ def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+ # need to grab file path before deletion
+ try:
+ path, _ = filepath_from_artifact(artifact, using_key)
+ except OSError:
+ # we can still delete the record
+ logger.warning("Could not get path")
+ storage = False
+ # only delete in storage if DB delete is successful
+ # DB delete might error because of a foreign key constraint violated etc.
+ if artifact._overwrite_versions and artifact.is_latest:
+ logger.important(
+ "deleting all versions of this artifact because they all share the same store"
+ )
+ for version in artifact.versions.all(): # includes artifact
+ _delete_skip_storage(version)
+ else:
+ artifact._delete_skip_storage()
+ # by default do not delete storage if deleting only a previous version
+ # and the underlying store is mutable
+ if artifact._overwrite_versions and not artifact.is_latest:
+ delete_in_storage = False
+ if storage:
+ logger.warning(
+ "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+ )
+ elif artifact.key is None or artifact._key_is_virtual:
+ # do not ask for confirmation also if storage is None
+ delete_in_storage = storage is None or storage
+ else:
+ # for artifacts with non-virtual semantic storage keys (key is not None)
+ # ask for extra-confirmation
+ if storage is None:
+ response = input(
+ f"Are you sure to want to delete {path}? (y/n) You can't undo"
+ " this action."
+ )
+ delete_in_storage = response == "y"
+ else:
+ delete_in_storage = storage
+ if not delete_in_storage:
+ logger.important(f"a file/folder remains here: {path}")
+ # we don't yet have logic to bring back the deleted metadata record
+ # in case storage deletion fails - this is important for ACID down the road
+ if delete_in_storage:
+ delete_msg = delete_storage(path, raise_file_not_found_error=False)
+ if delete_msg != "did-not-delete":
+ logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+ class LazyArtifact:
+ """Lazy artifact for streaming to auto-generated internal paths.
+
+ This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+ and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+ This object creates a real artifact on `.save()` with the provided arguments.
+
+ Args:
+ suffix: The suffix for the auto-generated internal path
+ overwrite_versions: Whether to overwrite versions.
+ **kwargs: Keyword arguments for the artifact to be created.
+
+ Examples:
+
+ Create a lazy artifact, write to the path and save to get a real artifact::
+
+ lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+ zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path
+ artifact = lazy.save()
+ """
+
+ def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+ self.kwargs = kwargs
+ self.kwargs["overwrite_versions"] = overwrite_versions
+
+ if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+ PurePosixPath(key)
+ ) != suffix:
+ raise ValueError(
+ "The suffix argument and the suffix of key should be the same."
+ )
+
+ uid, _ = create_uid(n_full_id=20)
+ storage_key = auto_storage_key_from_artifact_uid(
+ uid, suffix, overwrite_versions=overwrite_versions
+ )
+ storepath = setup_settings.storage.root / storage_key
+
+ self._path = storepath
+
+ @property
+ def path(self) -> UPath:
+ return self._path
+
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+ artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+ return artifact.save(upload=upload, **kwargs)
+
+ def __repr__(self) -> str: # pragma: no cover
+ show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+ return (
+ f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+ )
+
+
  class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  # Note that this docstring has to be consistent with Curator.save_artifact()
  """Datasets & models stored as files, folders, or arrays.
@@ -1030,15 +1143,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()

- If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
+ If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::

  schema = ln.Schema(itype=ln.Feature) # a schema that merely enforces that feature names exist in the Feature registry
- artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save() # validated and annotated
+ artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save() # validated and annotated
+
+ To annotate by **external features**::
+
+ schema = ln.examples.schemas.valid_features()
+ artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+ A `schema` can be optionally passed to also validate the features.

  You can make a **new version** of an artifact by passing an existing `key`::

  artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
- artifact_v2.versions.df() # see all versions
+ artifact_v2.versions.to_dataframe() # see all versions

  You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
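
Putting the two docstring snippets above together, annotating with external features and validating them in one call (a sketch; `valid_features()` is the example schema referenced above)::

    import lamindb as ln

    schema = ln.examples.schemas.valid_features()
    artifact = ln.Artifact(
        "./my_file.parquet",
        key="examples/my_file.parquet",
        features={"species": "bird"},
        schema=schema,  # optional: validates the features rather than merely annotating
    ).save()
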
@@ -1112,6 +1232,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
  abstract = False
+ app_label = "lamindb"
  constraints = [
  # a simple hard unique constraint on `hash` clashes with the fact
  # that pipelines sometimes aim to ingest the exact same file in different
@@ -1159,11 +1280,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  ln.Artifact.filter(scientist="Barbara McClintock")

- Features may or may not be part of the dataset, i.e., the artifact content in storage. For
- instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
- `DataFrame`-like artifact and annotates it with features corresponding to
- these columns. `artifact.features.add_values`, by contrast, does not
- validate the content of the artifact.
+ Features may or may not be part of the dataset, i.e., the artifact content in storage.
+ For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+ `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+ `artifact.features.add_values`, by contrast, does not validate the content of the artifact.

  .. dropdown:: An example for a model-like artifact

@@ -1178,6 +1298,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  "subset_highlyvariable": True,
  },
  })
+
+ To validate external features::
+
+ schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+ artifact.features.add_values({"species": "bird"}, schema=schema)
  """
  from ._feature_manager import FeatureManager

@@ -1387,15 +1512,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  # now proceed with the user-facing constructor
  if len(args) > 1:
  raise ValueError("Only one non-keyword arg allowed: data")
+
  data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
  kind: str = kwargs.pop("kind", None)
  key: str | None = kwargs.pop("key", None)
  run_id: int | None = kwargs.pop("run_id", None) # for REST API
  run: Run | None = kwargs.pop("run", None)
+ using_key = kwargs.pop("using_key", None)
  description: str | None = kwargs.pop("description", None)
  revises: Artifact | None = kwargs.pop("revises", None)
  overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
  version: str | None = kwargs.pop("version", None)
+
+ features: dict[str, Any] = kwargs.pop("features", None)
+ schema: Schema | None = kwargs.pop("schema", None)
+ if features is not None and schema is not None:
+ from lamindb.curators import DataFrameCurator
+
+ temp_df = pd.DataFrame([features])
+ validation_schema = schema
+ if schema.itype == "Composite" and schema.slots:
+ if len(schema.slots) > 1:
+ raise ValueError(
+ f"Composite schema has {len(schema.slots)} slots. "
+ "External feature validation only supports schemas with a single slot."
+ )
+ try:
+ validation_schema = next(
+ k for k in schema.slots.keys() if k.startswith("__external")
+ )
+ except StopIteration:
+ raise ValueError(
+ "External feature validation requires a slot that starts with __external."
+ ) from None
+
+ external_curator = DataFrameCurator(temp_df, validation_schema)
+ external_curator.validate()
+ external_curator._artifact = self
+
+ self._external_features = features
+
  branch_id: int | None = None
  if "visibility" in kwargs: # backward compat
  branch_id = kwargs.pop("visibility")
@@ -1406,13 +1562,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  else:
  branch_id = 1
  branch = kwargs.pop("branch", None)
+
  space = kwargs.pop("space", None)
- space_id = kwargs.pop("space_id", 1)
+ assert "space_id" not in kwargs, "please pass space instead" # noqa: S101
  format = kwargs.pop("format", None)
  _is_internal_call = kwargs.pop("_is_internal_call", False)
  skip_check_exists = kwargs.pop("skip_check_exists", False)
+ storage_was_passed = False
  if "storage" in kwargs:
  storage = kwargs.pop("storage")
+ storage_was_passed = True
  elif (
  setup_settings.instance.keep_artifacts_local
  and setup_settings.instance._local_storage is not None
@@ -1420,7 +1579,24 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  storage = setup_settings.instance.local_storage.record
  else:
  storage = setup_settings.instance.storage.record
- using_key = kwargs.pop("using_key", None)
+ if space is None:
+ from lamindb import context as run_context
+
+ if run_context.space is not None:
+ space = run_context.space
+ elif setup_settings.space is not None:
+ space = setup_settings.space
+ if space is not None and space != storage.space:
+ if storage_was_passed:
+ logger.warning(
+ "storage argument ignored as storage information from space takes precedence"
+ )
+ storage_locs_for_space = Storage.filter(space=space)
+ storage = storage_locs_for_space.first()
+ if len(storage_locs_for_space) > 1:
+ logger.warning(
+ f"more than one storage location for space {space}, choosing {storage}"
+ )
  otype = kwargs.pop("otype") if "otype" in kwargs else None
  if isinstance(data, str) and data.startswith("s3:///"):
  # issue in Groovy / nf-lamin producing malformed S3 paths
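
A sketch of the space-aware resolution added above: when a space is set explicitly, via the run context, or in the setup settings, the artifact is routed to a storage location registered for that space (the space name below is hypothetical)::

    import lamindb as ln

    space = ln.Space.get(name="our-team")  # hypothetical space
    artifact = ln.Artifact(
        "./my_file.parquet", key="examples/my_file.parquet", space=space
    ).save()
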
@@ -1461,6 +1637,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  )
  else:
  is_automanaged_path = False
+
  provisional_uid, revises = create_uid(revises=revises, version=version)
  kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
  data=data,
@@ -1518,7 +1695,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  uid, revises = create_uid(revises=revises, version=version)
  kwargs["uid"] = uid

- # only set key now so that we don't do a look-up on it in case revises is passed
+ # only set key now so that we don't perform a look-up on it in case revises is passed
  if revises is not None and revises.key is not None and kwargs["key"] is None:
  kwargs["key"] = revises.key

@@ -1530,7 +1707,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  kwargs["branch"] = branch
  kwargs["branch_id"] = branch_id
  kwargs["space"] = space
- kwargs["space_id"] = space_id
  kwargs["otype"] = otype
  kwargs["revises"] = revises
  # this check needs to come down here because key might be populated from an
@@ -1544,6 +1720,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  super().__init__(**kwargs)

+ @classmethod
+ def from_lazy(
+ cls,
+ suffix: str,
+ overwrite_versions: bool,
+ key: str | None = None,
+ description: str | None = None,
+ run: Run | None = None,
+ **kwargs,
+ ) -> LazyArtifact:
+ """Create a lazy artifact for streaming to auto-generated internal paths.
+
+ This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+ and register the path as an artifact.
+
+ The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+ on `.save()` with the provided arguments.
+
+ Args:
+ suffix: The suffix for the auto-generated internal path
+ overwrite_versions: Whether to overwrite versions.
+ key: An optional key to reference the artifact.
+ description: A description.
+ run: The run that creates the artifact.
+ **kwargs: Other keyword arguments for the artifact to be created.
+
+ Examples:
+
+ Create a lazy artifact, write to the path and save to get a real artifact::
+
+ lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+ zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path
+ artifact = lazy.save()
+ """
+ args = {"key": key, "description": description, "run": run, **kwargs}
+ return LazyArtifact(suffix, overwrite_versions, **args)
+
  @property
  @deprecated("kind")
  def type(self) -> str:
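
The docstring example expanded into a self-contained sketch (assumes `zarr` and `numpy` are installed and an instance is loaded)::

    import lamindb as ln
    import numpy as np
    import zarr

    lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
    zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the internal path
    artifact = lazy.save()  # now a real, queryable Artifact record
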
@@ -1627,6 +1840,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  idlike: Either a uid stub, uid or an integer id.
  is_run_input: Whether to track this artifact as run input.
  expressions: Fields and values passed as Django query expressions.
+ Use `path=...` to get an artifact for a local or remote filepath if it exists.

  Raises:
  :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1641,6 +1855,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
  artifact = ln.Artifact.get(key="examples/my_file.parquet")
+ artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
  """
  from .query_set import QuerySet

@@ -1710,7 +1925,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  )

  @classmethod
- def from_df(
+ def from_dataframe(
  cls,
  df: pd.DataFrame,
  *,
@@ -1719,6 +1934,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  run: Run | None = None,
  revises: Artifact | None = None,
  schema: Schema | None = None,
+ features: dict[str, Any] | None = None,
  **kwargs,
  ) -> Artifact:
  """Create from `DataFrame`, optionally validate & annotate.
@@ -1731,6 +1947,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  revises: An old version of the artifact.
  run: The run that creates the artifact.
  schema: A schema that defines how to validate & annotate.
+ features: External features dict for additional annotation.

  See Also:
  :meth:`~lamindb.Collection`
@@ -1745,7 +1962,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  import lamindb as ln

  df = ln.core.datasets.mini_immuno.get_dataset1()
- artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+ artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()

  With validation and annotation.

@@ -1762,6 +1979,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  .. literalinclude:: scripts/define_mini_immuno_features_labels.py
  :language: python

+ External features:
+
+ .. literalinclude:: scripts/curate_dataframe_external_features.py
+ :language: python
  """
  artifact = Artifact( # type: ignore
  data=df,
@@ -1774,8 +1995,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  **kwargs,
  )
  artifact.n_observations = len(df)
+
  if schema is not None:
- from ..curators import DataFrameCurator
+ from lamindb.curators.core import ComponentCurator

  if not artifact._state.adding and artifact.suffix != ".parquet":
  logger.warning(
@@ -1784,12 +2006,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  )
  return artifact

- curator = DataFrameCurator(artifact, schema)
- curator.validate()
- artifact.schema = schema
- artifact._curator = curator
+ # Handle external features validation for Composite schemas
+ if schema.itype == "Composite" and features is not None:
+ try:
+ external_slot = next(
+ k for k in schema.slots.keys() if "__external__" in k
+ )
+ validation_schema = schema.slots[external_slot]
+ except StopIteration:
+ raise ValueError(
+ "External feature validation requires a slot __external__."
+ ) from None
+
+ external_curator = ComponentCurator(
+ pd.DataFrame([features]), validation_schema
+ )
+ external_curator.validate()
+ artifact._external_features = features
+
+ # Validate main DataFrame if not Composite or if Composite has attrs
+ if schema.itype != "Composite" or "attrs" in schema.slots:
+ curator = ComponentCurator(artifact, schema)
+ curator.validate()
+ artifact.schema = schema
+ artifact._curator = curator
+
  return artifact

+ @classmethod
+ @deprecated("from_dataframe")
+ def from_df(
+ cls,
+ df: pd.DataFrame,
+ *,
+ key: str | None = None,
+ description: str | None = None,
+ run: Run | None = None,
+ revises: Artifact | None = None,
+ schema: Schema | None = None,
+ **kwargs,
+ ) -> Artifact:
+ return cls.from_dataframe(
+ df,
+ key=key,
+ description=description,
+ run=run,
+ revises=revises,
+ schema=schema,
+ **kwargs,
+ )
+
  @classmethod
  def from_anndata(
  cls,
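
The deprecated shim keeps old call sites working while routing to the new constructor; note that it does not forward the new `features` argument. Both calls below create the same artifact (a sketch reusing the dataset from the docstring above)::

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()
    ln.Artifact.from_df(df, key="examples/dataset1.parquet")         # warns, forwards
    ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet")  # preferred
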
@@ -2580,94 +2846,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
  artifact.delete() # delete all versions, the data will be deleted or prompted for deletion.
  """
- # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
- # storage = True if storage is None else storage
-
- # this first check means an invalid delete fails fast rather than cascading through
- # database and storage permission errors
- if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
- isettings = setup_settings.instance
- if self.storage.instance_uid != isettings.uid and (
- storage or storage is None
- ):
- raise IntegrityError(
- "Cannot simply delete artifacts outside of this instance's managed storage locations."
- "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
- f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
- f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
- )
- # by default, we only move artifacts into the trash (branch_id = -1)
- trash_branch_id = -1
- if self.branch_id > trash_branch_id and not permanent:
- if storage is not None:
- logger.warning("moving artifact to trash, storage arg is ignored")
- # move to trash
- self.branch_id = trash_branch_id
- self.save()
- logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
- return
-
- # if the artifact is already in the trash
- # permanent delete skips the trash
- if permanent is None:
- # ask for confirmation of permanent delete
- response = input(
- "Artifact record is already in trash! Are you sure you want to permanently"
- " delete it? (y/n) You can't undo this action."
- )
- delete_record = response == "y"
- else:
- assert permanent # noqa: S101
- delete_record = True
-
- if delete_record:
- # need to grab file path before deletion
- try:
- path, _ = filepath_from_artifact(self, using_key)
- except OSError:
- # we can still delete the record
- logger.warning("Could not get path")
- storage = False
- # only delete in storage if DB delete is successful
- # DB delete might error because of a foreign key constraint violated etc.
- if self._overwrite_versions and self.is_latest:
- logger.important(
- "deleting all versions of this artifact because they all share the same store"
- )
- for version in self.versions.all(): # includes self
- _delete_skip_storage(version)
- else:
- self._delete_skip_storage()
- # by default do not delete storage if deleting only a previous version
- # and the underlying store is mutable
- if self._overwrite_versions and not self.is_latest:
- delete_in_storage = False
- if storage:
- logger.warning(
- "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
- )
- elif self.key is None or self._key_is_virtual:
- # do not ask for confirmation also if storage is None
- delete_in_storage = storage is None or storage
- else:
- # for artifacts with non-virtual semantic storage keys (key is not None)
- # ask for extra-confirmation
- if storage is None:
- response = input(
- f"Are you sure to want to delete {path}? (y/n) You can't undo"
- " this action."
- )
- delete_in_storage = response == "y"
- else:
- delete_in_storage = storage
- if not delete_in_storage:
- logger.important(f"a file/folder remains here: {path}")
- # we don't yet have logic to bring back the deleted metadata record
- # in case storage deletion fails - this is important for ACID down the road
- if delete_in_storage:
- delete_msg = delete_storage(path, raise_file_not_found_error=False)
- if delete_msg != "did-not-delete":
- logger.success(f"deleted {colors.yellow(f'{path}')}")
+ super().delete(permanent=permanent, storage=storage, using_key=using_key)

  @property
  def _is_saved_to_storage_location(self) -> bool | None:
@@ -2796,11 +2975,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  local_path_cache,
  )
  logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+ # Handle external features
+ if hasattr(self, "_external_features") and self._external_features is not None:
+ external_features = self._external_features
+ delattr(self, "_external_features")
+ self.features.add_values(external_features)
+
+ # annotate Artifact
  if hasattr(self, "_curator"):
  curator = self._curator
  delattr(self, "_curator")
  # just annotates this artifact
  curator.save_artifact()
+
  return self

  def restore(self) -> None:
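
The deferred hand-off above means externally passed features are attached only once the artifact itself is saved; a sketch of the resulting flow::

    artifact = ln.Artifact(
        "./my_file.parquet",
        key="examples/my_file.parquet",
        features={"species": "bird"},
    )
    artifact = artifact.save()  # upload first, then features.add_values({"species": "bird"})
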
@@ -2848,7 +3036,7 @@ def _synchronize_cleanup_on_error(


  def _delete_skip_storage(artifact, *args, **kwargs) -> None:
- super(Artifact, artifact).delete(*args, **kwargs)
+ super(SQLRecord, artifact).delete(*args, **kwargs)


  def _save_skip_storage(artifact, **kwargs) -> None:
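
A minimal sketch of why the `super()` target changed: with `Artifact.delete` now delegating to `SQLRecord.delete` (see the hunk above), skipping the storage logic requires starting the MRO lookup above `SQLRecord`, so that only Django's base `delete` runs (all class names below are stand-ins)::

    class Model:  # stands in for django.db.models.Model
        def delete(self):
            print("Model.delete: removes the database row")

    class SQLRecord(Model):
        def delete(self):
            print("SQLRecord.delete: trash / permanent-delete logic")
            super().delete()

    class Artifact(SQLRecord):
        def delete(self):
            print("Artifact.delete: storage handling")
            super().delete()

    artifact = Artifact()
    super(SQLRecord, artifact).delete()  # prints only "Model.delete: ..."
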
@@ -2866,6 +3054,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
  featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")

  class Meta:
+ app_label = "lamindb"
  unique_together = ("artifact", "featurevalue")
