lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +442 -188
  11. lamindb/errors.py +6 -0
  12. lamindb/examples/cellxgene/__init__.py +8 -3
  13. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  14. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  15. lamindb/examples/croissant/__init__.py +32 -6
  16. lamindb/examples/datasets/__init__.py +2 -2
  17. lamindb/examples/datasets/_core.py +9 -2
  18. lamindb/examples/datasets/_small.py +66 -22
  19. lamindb/examples/fixtures/sheets.py +8 -2
  20. lamindb/integrations/_croissant.py +34 -11
  21. lamindb/migrations/0119_squashed.py +5 -2
  22. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  23. lamindb/migrations/0121_recorduser.py +60 -0
  24. lamindb/models/__init__.py +4 -1
  25. lamindb/models/_describe.py +2 -2
  26. lamindb/models/_feature_manager.py +131 -71
  27. lamindb/models/_from_values.py +2 -2
  28. lamindb/models/_is_versioned.py +4 -4
  29. lamindb/models/_label_manager.py +4 -4
  30. lamindb/models/artifact.py +326 -172
  31. lamindb/models/artifact_set.py +45 -1
  32. lamindb/models/can_curate.py +1 -2
  33. lamindb/models/collection.py +3 -34
  34. lamindb/models/feature.py +111 -7
  35. lamindb/models/has_parents.py +11 -11
  36. lamindb/models/project.py +18 -0
  37. lamindb/models/query_manager.py +16 -7
  38. lamindb/models/query_set.py +191 -78
  39. lamindb/models/record.py +30 -5
  40. lamindb/models/run.py +10 -33
  41. lamindb/models/save.py +6 -8
  42. lamindb/models/schema.py +54 -26
  43. lamindb/models/sqlrecord.py +152 -40
  44. lamindb/models/storage.py +59 -14
  45. lamindb/models/transform.py +17 -17
  46. lamindb/models/ulabel.py +6 -1
  47. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
  48. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
  49. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
  50. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
@@ -1,7 +1,6 @@
 # ruff: noqa: TC004
 from __future__ import annotations
 
-import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
@@ -9,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
-import numpy as np
 import pandas as pd
 from anndata import AnnData
 from django.db import connections, models
@@ -63,14 +61,13 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import IntegrityError, InvalidArgument, ValidationError
+from ..errors import InvalidArgument, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
 from ._django import get_artifact_with_related, get_collection_with_related
 from ._feature_manager import (
     FeatureManager,
-    filter_base,
     get_label_links,
 )
 from ._is_versioned import IsVersioned
@@ -201,7 +198,7 @@ def process_pathlike(
             # hence, we revert the creation and throw an error
             storage_record.delete()
             raise UnknownStorageLocation(
-                f"Path {filepath} is not contained in any known storage location:\n{Storage.df()[['uid', 'root', 'type']]}\n\n"
+                f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
                 f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
             )
         use_existing_storage_key = True
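
The hunk above reflects the registry-wide rename of `.df()` to `.to_dataframe()`. A minimal sketch of the renamed accessor, assuming a configured lamindb instance::

    import lamindb as ln

    # storage locations as a pandas DataFrame, as used in the error message above
    ln.Storage.to_dataframe()[["uid", "root", "type"]]
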
@@ -419,24 +416,6 @@ def get_artifact_kwargs_from_data(
         skip_check_exists,
         is_replace=is_replace,
     )
-    stat_or_artifact = get_stat_or_artifact(
-        path=path,
-        key=key,
-        instance=using_key,
-        is_replace=is_replace,
-    )
-    if isinstance(stat_or_artifact, Artifact):
-        existing_artifact = stat_or_artifact
-        if run is not None:
-            existing_artifact._populate_subsequent_runs(run)
-        return existing_artifact, None
-    else:
-        size, hash, hash_type, n_files, revises = stat_or_artifact
-
-    if revises is not None:  # update provisional_uid
-        provisional_uid, revises = create_uid(revises=revises, version=version)
-        if settings.cache_dir in path.parents:
-            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
 
     check_path_in_storage = False
     if use_existing_storage_key:
@@ -457,6 +436,25 @@
     else:
         storage = storage
 
+    stat_or_artifact = get_stat_or_artifact(
+        path=path,
+        key=key,
+        instance=using_key,
+        is_replace=is_replace,
+    )
+    if isinstance(stat_or_artifact, Artifact):
+        existing_artifact = stat_or_artifact
+        if run is not None:
+            existing_artifact._populate_subsequent_runs(run)
+        return existing_artifact, None
+    else:
+        size, hash, hash_type, n_files, revises = stat_or_artifact
+
+    if revises is not None:  # update provisional_uid
+        provisional_uid, revises = create_uid(revises=revises, version=version)
+        if settings.cache_dir in path.parents:
+            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
     log_storage_hint(
         check_path_in_storage=check_path_in_storage,
         storage=storage,
@@ -552,12 +550,19 @@ def data_is_scversedatastructure(
         file_suffix = ".h5mu"
     # SpatialData does not have a unique suffix but `.zarr`
 
+    # AnnData allows both AnnDataAccessor and AnnData
+    class_name = data.__class__.__name__
     if structure_type is None:
         return any(
-            hasattr(data, "__class__") and data.__class__.__name__ == cl_name
+            class_name
+            in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
             for cl_name in ["AnnData", "MuData", "SpatialData"]
         )
-    elif hasattr(data, "__class__") and data.__class__.__name__ == structure_type:
+    elif class_name in (
+        ["AnnData", "AnnDataAccessor"]
+        if structure_type == "AnnData"
+        else [structure_type]
+    ):
         return True
 
     data_type = structure_type.lower()
@@ -586,6 +591,7 @@ def data_is_scversedatastructure(
                 f"we do not check whether cloud zarr is {structure_type}"
             )
             return False
+
     return False
 
 
@@ -605,7 +611,7 @@ def _check_otype_artifact(
 ) -> str:
     if otype is None:
         if isinstance(data, pd.DataFrame):
-            logger.warning("data is a DataFrame, please use .from_df()")
+            logger.warning("data is a DataFrame, please use .from_dataframe()")
             otype = "DataFrame"
         return otype
 
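
This warning is part of the `from_df()` → `from_dataframe()` deprecation (the backward-compatible `from_df` shim appears further down). A minimal sketch of the renamed constructor, assuming a configured instance::

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"cell_type": ["T cell", "B cell"]})
    artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
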
@@ -873,7 +879,7 @@ def get_labels(
 
         values = []
         for v in qs_by_registry.values():
-            values += v.list(get_name_field(v))
+            values += v.to_list(get_name_field(v))
         return values
     if len(registries_to_check) == 1 and registry in qs_by_registry:
         return qs_by_registry[registry]
@@ -896,7 +902,7 @@ def add_labels(
         raise ValueError("Please save the artifact/collection before adding a label!")
 
     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
-        records = records.list()
+        records = records.to_list()
     if isinstance(records, (str, SQLRecord)):
         records = [records]
     if not isinstance(records, list):  # avoids warning for pd Series
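
`QuerySet.list()` is likewise renamed to `.to_list()`, mirroring `.df()` → `.to_dataframe()`. A minimal sketch, assuming the signature shown above (an optional field name)::

    import lamindb as ln

    # a plain Python list of field values instead of a QuerySet
    names = ln.ULabel.filter().to_list("name")
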
@@ -995,6 +1001,112 @@ def add_labels(
     )
 
 
+def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+    # need to grab file path before deletion
+    try:
+        path, _ = filepath_from_artifact(artifact, using_key)
+    except OSError:
+        # we can still delete the record
+        logger.warning("Could not get path")
+        storage = False
+    # only delete in storage if DB delete is successful
+    # DB delete might error because of a foreign key constraint violated etc.
+    if artifact._overwrite_versions and artifact.is_latest:
+        logger.important(
+            "deleting all versions of this artifact because they all share the same store"
+        )
+        for version in artifact.versions.all():  # includes artifact
+            _delete_skip_storage(version)
+    else:
+        artifact._delete_skip_storage()
+    # by default do not delete storage if deleting only a previous version
+    # and the underlying store is mutable
+    if artifact._overwrite_versions and not artifact.is_latest:
+        delete_in_storage = False
+        if storage:
+            logger.warning(
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+            )
+    elif artifact.key is None or artifact._key_is_virtual:
+        # do not ask for confirmation also if storage is None
+        delete_in_storage = storage is None or storage
+    else:
+        # for artifacts with non-virtual semantic storage keys (key is not None)
+        # ask for extra-confirmation if storage is None
+        if storage is None:
+            response = input(
+                f"Are you sure to want to delete {path}? (y/n) You can't undo"
+                " this action."
+            )
+            delete_in_storage = response == "y"
+        else:
+            delete_in_storage = storage
+    if not delete_in_storage:
+        logger.important(f"a file/folder remains here: {path}")
+    # we don't yet have logic to bring back the deleted metadata record
+    # in case storage deletion fails - this is important for ACID down the road
+    if delete_in_storage:
+        delete_msg = delete_storage(path, raise_file_not_found_error=False)
+        if delete_msg != "did-not-delete":
+            logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+class LazyArtifact:
+    """Lazy artifact for streaming to auto-generated internal paths.
+
+    This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+    and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+    This object creates a real artifact on `.save()` with the provided arguments.
+
+    Args:
+        suffix: The suffix for the auto-generated internal path.
+        overwrite_versions: Whether to overwrite versions.
+        **kwargs: Keyword arguments for the artifact to be created.
+
+    Examples:
+
+        Create a lazy artifact, write to the path and save to get a real artifact::
+
+            lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+            zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+            artifact = lazy.save()
+    """
+
+    def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+        self.kwargs = kwargs
+        self.kwargs["overwrite_versions"] = overwrite_versions
+
+        if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+            PurePosixPath(key)
+        ) != suffix:
+            raise ValueError(
+                "The suffix argument and the suffix of key should be the same."
+            )
+
+        uid, _ = create_uid(n_full_id=20)
+        storage_key = auto_storage_key_from_artifact_uid(
+            uid, suffix, overwrite_versions=overwrite_versions
+        )
+        storepath = setup_settings.storage.root / storage_key
+
+        self._path = storepath
+
+    @property
+    def path(self) -> UPath:
+        return self._path
+
+    def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+        artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+        return artifact.save(upload=upload, **kwargs)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+        return (
+            f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+        )
+
+
 class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     # Note that this docstring has to be consistent with Curator.save_artifact()
     """Datasets & models stored as files, folders, or arrays.
@@ -1030,15 +1142,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
 
-    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
+    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::
 
         schema = ln.Schema(itype=ln.Feature)  # a schema that merely enforces that feature names exist in the Feature registry
-        artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+        artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+
+    To annotate by **external features**::
+
+        schema = ln.examples.schemas.valid_features()
+        artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+    A `schema` can be optionally passed to also validate the features.
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
         artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
-        artifact_v2.versions.df()  # see all versions
+        artifact_v2.versions.to_dataframe()  # see all versions
 
     You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
@@ -1112,6 +1231,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
         abstract = False
+        app_label = "lamindb"
         constraints = [
             # a simple hard unique constraint on `hash` clashes with the fact
             # that pipelines sometimes aim to ingest the exact same file in different
@@ -1159,11 +1279,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             ln.Artifact.filter(scientist="Barbara McClintock")
 
-        Features may or may not be part of the dataset, i.e., the artifact content in storage. For
-        instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
-        `DataFrame`-like artifact and annotates it with features corresponding to
-        these columns. `artifact.features.add_values`, by contrast, does not
-        validate the content of the artifact.
+        Features may or may not be part of the dataset, i.e., the artifact content in storage.
+        For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+        `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+        `artifact.features.add_values`, by contrast, does not validate the content of the artifact.
 
         .. dropdown:: An example for a model-like artifact
 
@@ -1178,6 +1297,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     "subset_highlyvariable": True,
                 },
             })
+
+        To validate external features::
+
+            schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+            artifact.features.add_values({"species": "bird"}, schema=schema)
     """
     from ._feature_manager import FeatureManager
 
@@ -1387,15 +1511,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # now proceed with the user-facing constructor
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: data")
+
         data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
         kind: str = kwargs.pop("kind", None)
         key: str | None = kwargs.pop("key", None)
         run_id: int | None = kwargs.pop("run_id", None)  # for REST API
         run: Run | None = kwargs.pop("run", None)
+        using_key = kwargs.pop("using_key", None)
         description: str | None = kwargs.pop("description", None)
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
+
+        features: dict[str, Any] = kwargs.pop("features", None)
+        schema: Schema | None = kwargs.pop("schema", None)
+        if features is not None and schema is not None:
+            from lamindb.curators import DataFrameCurator
+
+            temp_df = pd.DataFrame([features])
+            validation_schema = schema
+            if schema.itype == "Composite" and schema.slots:
+                if len(schema.slots) > 1:
+                    raise ValueError(
+                        f"Composite schema has {len(schema.slots)} slots. "
+                        "External feature validation only supports schemas with a single slot."
+                    )
+                try:
+                    validation_schema = next(
+                        k for k in schema.slots.keys() if k.startswith("__external")
+                    )
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot that starts with __external."
+                    ) from None
+
+            external_curator = DataFrameCurator(temp_df, validation_schema)
+            external_curator.validate()
+            external_curator._artifact = self
+
+        self._external_features = features
+
         branch_id: int | None = None
         if "visibility" in kwargs:  # backward compat
             branch_id = kwargs.pop("visibility")
@@ -1406,13 +1561,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         else:
             branch_id = 1
         branch = kwargs.pop("branch", None)
+
         space = kwargs.pop("space", None)
-        space_id = kwargs.pop("space_id", 1)
+        assert "space_id" not in kwargs, "please pass space instead"  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
+        storage_was_passed = False
         if "storage" in kwargs:
             storage = kwargs.pop("storage")
+            storage_was_passed = True
         elif (
             setup_settings.instance.keep_artifacts_local
             and setup_settings.instance._local_storage is not None
@@ -1420,7 +1578,24 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             storage = setup_settings.instance.local_storage.record
         else:
             storage = setup_settings.instance.storage.record
-        using_key = kwargs.pop("using_key", None)
+        if space is None:
+            from lamindb import context as run_context
+
+            if run_context.space is not None:
+                space = run_context.space
+            elif setup_settings.space is not None:
+                space = setup_settings.space
+        if space is not None and space != storage.space:
+            if storage_was_passed:
+                logger.warning(
+                    "storage argument ignored as storage information from space takes precedence"
+                )
+            storage_locs_for_space = Storage.filter(space=space)
+            storage = storage_locs_for_space.first()
+            if len(storage_locs_for_space) > 1:
+                logger.warning(
+                    f"more than one storage location for space {space}, choosing {storage}"
+                )
         otype = kwargs.pop("otype") if "otype" in kwargs else None
         if isinstance(data, str) and data.startswith("s3:///"):
             # issue in Groovy / nf-lamin producing malformed S3 paths
@@ -1461,6 +1636,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
         else:
             is_automanaged_path = False
+
         provisional_uid, revises = create_uid(revises=revises, version=version)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
@@ -1518,7 +1694,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             uid, revises = create_uid(revises=revises, version=version)
             kwargs["uid"] = uid
 
-        # only set key now so that we don't do a look-up on it in case revises is passed
+        # only set key now so that we don't perform a look-up on it in case revises is passed
         if revises is not None and revises.key is not None and kwargs["key"] is None:
             kwargs["key"] = revises.key
 
@@ -1530,7 +1706,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["branch"] = branch
         kwargs["branch_id"] = branch_id
         kwargs["space"] = space
-        kwargs["space_id"] = space_id
         kwargs["otype"] = otype
         kwargs["revises"] = revises
         # this check needs to come down here because key might be populated from an
@@ -1544,6 +1719,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path.
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1627,6 +1839,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             idlike: Either a uid stub, uid or an integer id.
             is_run_input: Whether to track this artifact as run input.
             expressions: Fields and values passed as Django query expressions.
+                Use `path=...` to get an artifact for a local or remote filepath if it exists.
 
         Raises:
             :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1641,6 +1854,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
             artifact = ln.Artifact.get(key="examples/my_file.parquet")
+            artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
         """
         from .query_set import QuerySet
 
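
`Artifact.get()` gains a `path=...` expression for looking up the record registered for a local or remote filepath::

    import lamindb as ln

    artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
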
@@ -1672,45 +1886,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             ln.Artifact.filter(cell_type_by_model__name="T cell")
 
         """
-        from .query_set import QuerySet
-
-        if expressions:
-            keys_normalized = [key.split("__")[0] for key in expressions]
-            field_or_feature_or_param = keys_normalized[0].split("__")[0]
-            if field_or_feature_or_param in Artifact.__get_available_fields__():
-                qs = QuerySet(model=cls).filter(*queries, **expressions)
-                if not any(e.startswith("kind") for e in expressions):
-                    return qs.exclude(kind="__lamindb_run__")
-                else:
-                    return qs
-            elif all(
-                features_validated := Feature.validate(
-                    keys_normalized, field="name", mute=True
-                )
-            ):
-                return filter_base(Artifact, **expressions)
-            else:
-                features = ", ".join(
-                    sorted(np.array(keys_normalized)[~features_validated])
-                )
-                message = f"feature names: {features}"
-                avail_fields = cls.__get_available_fields__()
-                if "_branch_code" in avail_fields:
-                    avail_fields.remove("_branch_code")  # backward compat
-                fields = ", ".join(sorted(avail_fields))
-                raise InvalidArgument(
-                    f"You can query either by available fields: {fields}\n"
-                    f"Or fix invalid {message}"
-                )
-        else:
-            return (
-                QuerySet(model=cls)
-                .filter(*queries, **expressions)
-                .exclude(kind="__lamindb_run__")
-            )
+        # from Registry metaclass
+        return type(cls).filter(cls, *queries, **expressions)
 
     @classmethod
-    def from_df(
+    def from_dataframe(
         cls,
         df: pd.DataFrame,
         *,
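
Artifact-specific query logic (feature-name lookup and exclusion of `__lamindb_run__` artifacts) now lives on the Registry metaclass, so `Artifact.filter` becomes a thin delegation; both query styles from the docstring keep working::

    import lamindb as ln

    ln.Artifact.filter(key__startswith="examples/")      # by a registry field
    ln.Artifact.filter(scientist="Barbara McClintock")   # by a validated Feature name
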
@@ -1719,6 +1899,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         run: Run | None = None,
         revises: Artifact | None = None,
         schema: Schema | None = None,
+        features: dict[str, Any] | None = None,
         **kwargs,
     ) -> Artifact:
         """Create from `DataFrame`, optionally validate & annotate.
@@ -1731,6 +1912,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
+            features: External features dict for additional annotation.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1745,7 +1927,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             import lamindb as ln
 
             df = ln.core.datasets.mini_immuno.get_dataset1()
-            artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+            artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
         With validation and annotation.
 
@@ -1762,6 +1944,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             .. literalinclude:: scripts/define_mini_immuno_features_labels.py
                 :language: python
 
+            External features:
+
+            .. literalinclude:: scripts/curate_dataframe_external_features.py
+                :language: python
         """
         artifact = Artifact(  # type: ignore
             data=df,
@@ -1774,8 +1960,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
+
         if schema is not None:
-            from ..curators import DataFrameCurator
+            from lamindb.curators.core import ComponentCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1784,12 +1971,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-            curator = DataFrameCurator(artifact, schema)
-            curator.validate()
-            artifact.schema = schema
-            artifact._curator = curator
+            # Handle external features validation for Composite schemas
+            if schema.itype == "Composite" and features is not None:
+                try:
+                    external_slot = next(
+                        k for k in schema.slots.keys() if "__external__" in k
+                    )
+                    validation_schema = schema.slots[external_slot]
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot __external__."
+                    ) from None
+
+                external_curator = ComponentCurator(
+                    pd.DataFrame([features]), validation_schema
+                )
+                external_curator.validate()
+                artifact._external_features = features
+
+            # Validate main DataFrame if not Composite or if Composite has attrs
+            if schema.itype != "Composite" or "attrs" in schema.slots:
+                curator = ComponentCurator(artifact, schema)
+                curator.validate()
+                artifact.schema = schema
+                artifact._curator = curator
+
         return artifact
 
+    @classmethod
+    @deprecated("from_dataframe")
+    def from_df(
+        cls,
+        df: pd.DataFrame,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        revises: Artifact | None = None,
+        schema: Schema | None = None,
+        **kwargs,
+    ) -> Artifact:
+        return cls.from_dataframe(
+            df,
+            key=key,
+            description=description,
+            run=run,
+            revises=revises,
+            schema=schema,
+            **kwargs,
+        )
+
     @classmethod
     def from_anndata(
         cls,
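
With the new `features` argument, `from_dataframe` validates external features against the `__external__` slot of a Composite schema while the remaining slots validate the DataFrame itself. A hedged sketch, assuming `composite_schema` is a saved Composite schema with such a slot::

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()
    artifact = ln.Artifact.from_dataframe(
        df,
        key="examples/dataset1.parquet",
        schema=composite_schema,        # hypothetical: a Composite schema with an "__external__" slot
        features={"species": "bird"},   # validated against that slot
    ).save()
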
@@ -2580,94 +2811,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
            artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
            artifact.delete()  # delete all versions, the data will be deleted or prompted for deletion.
        """
-        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
-        # storage = True if storage is None else storage
-
-        # this first check means an invalid delete fails fast rather than cascading through
-        # database and storage permission errors
-        if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
-            isettings = setup_settings.instance
-            if self.storage.instance_uid != isettings.uid and (
-                storage or storage is None
-            ):
-                raise IntegrityError(
-                    "Cannot simply delete artifacts outside of this instance's managed storage locations."
-                    "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
-                    f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
-                    f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
-                )
-        # by default, we only move artifacts into the trash (branch_id = -1)
-        trash_branch_id = -1
-        if self.branch_id > trash_branch_id and not permanent:
-            if storage is not None:
-                logger.warning("moving artifact to trash, storage arg is ignored")
-            # move to trash
-            self.branch_id = trash_branch_id
-            self.save()
-            logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
-            return
-
-        # if the artifact is already in the trash
-        # permanent delete skips the trash
-        if permanent is None:
-            # ask for confirmation of permanent delete
-            response = input(
-                "Artifact record is already in trash! Are you sure you want to permanently"
-                " delete it? (y/n) You can't undo this action."
-            )
-            delete_record = response == "y"
-        else:
-            assert permanent  # noqa: S101
-            delete_record = True
-
-        if delete_record:
-            # need to grab file path before deletion
-            try:
-                path, _ = filepath_from_artifact(self, using_key)
-            except OSError:
-                # we can still delete the record
-                logger.warning("Could not get path")
-                storage = False
-            # only delete in storage if DB delete is successful
-            # DB delete might error because of a foreign key constraint violated etc.
-            if self._overwrite_versions and self.is_latest:
-                logger.important(
-                    "deleting all versions of this artifact because they all share the same store"
-                )
-                for version in self.versions.all():  # includes self
-                    _delete_skip_storage(version)
-            else:
-                self._delete_skip_storage()
-            # by default do not delete storage if deleting only a previous version
-            # and the underlying store is mutable
-            if self._overwrite_versions and not self.is_latest:
-                delete_in_storage = False
-                if storage:
-                    logger.warning(
-                        "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
-                    )
-            elif self.key is None or self._key_is_virtual:
-                # do not ask for confirmation also if storage is None
-                delete_in_storage = storage is None or storage
-            else:
-                # for artifacts with non-virtual semantic storage keys (key is not None)
-                # ask for extra-confirmation
-                if storage is None:
-                    response = input(
-                        f"Are you sure to want to delete {path}? (y/n) You can't undo"
-                        " this action."
-                    )
-                    delete_in_storage = response == "y"
-                else:
-                    delete_in_storage = storage
-            if not delete_in_storage:
-                logger.important(f"a file/folder remains here: {path}")
-            # we don't yet have logic to bring back the deleted metadata record
-            # in case storage deletion fails - this is important for ACID down the road
-            if delete_in_storage:
-                delete_msg = delete_storage(path, raise_file_not_found_error=False)
-                if delete_msg != "did-not-delete":
-                    logger.success(f"deleted {colors.yellow(f'{path}')}")
+        super().delete(permanent=permanent, storage=storage, using_key=using_key)
 
     @property
     def _is_saved_to_storage_location(self) -> bool | None:
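
`Artifact.delete()` now delegates to the shared `SQLRecord.delete()`; the removed body was moved into the module-level `delete_permanently` helper shown earlier. A sketch of the unchanged two-step user-facing flow from the docstring::

    import lamindb as ln

    artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
    artifact.delete()                # first call: moves the record to trash (branch_id = -1)
    artifact.delete(permanent=True)  # permanently deletes the record and, with confirmation, the data
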
@@ -2796,11 +2940,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 local_path_cache,
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+        # Handle external features
+        if hasattr(self, "_external_features") and self._external_features is not None:
+            external_features = self._external_features
+            delattr(self, "_external_features")
+            self.features.add_values(external_features)
+
+        # annotate Artifact
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
             # just annotates this artifact
             curator.save_artifact()
+
         return self
 
     def restore(self) -> None:
@@ -2848,7 +3001,7 @@ def _synchronize_cleanup_on_error(
 
 
 def _delete_skip_storage(artifact, *args, **kwargs) -> None:
-    super(Artifact, artifact).delete(*args, **kwargs)
+    super(SQLRecord, artifact).delete(*args, **kwargs)
 
 
 def _save_skip_storage(artifact, **kwargs) -> None:
@@ -2866,6 +3019,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
     featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
+        app_label = "lamindb"
         unique_together = ("artifact", "featurevalue")
 