lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +45 -2
  8. lamindb/core/storage/_anndata_accessor.py +118 -26
  9. lamindb/core/storage/_backed_access.py +10 -7
  10. lamindb/core/storage/_spatialdata_accessor.py +15 -4
  11. lamindb/core/storage/_zarr.py +3 -0
  12. lamindb/curators/_legacy.py +16 -3
  13. lamindb/curators/core.py +449 -193
  14. lamindb/errors.py +6 -0
  15. lamindb/examples/cellxgene/__init__.py +8 -3
  16. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  17. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  18. lamindb/examples/croissant/__init__.py +32 -6
  19. lamindb/examples/datasets/__init__.py +2 -2
  20. lamindb/examples/datasets/_core.py +9 -2
  21. lamindb/examples/datasets/_small.py +66 -22
  22. lamindb/examples/fixtures/sheets.py +8 -2
  23. lamindb/integrations/_croissant.py +34 -11
  24. lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
  25. lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
  26. lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
  27. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  28. lamindb/migrations/0121_recorduser.py +60 -0
  29. lamindb/models/__init__.py +4 -1
  30. lamindb/models/_describe.py +2 -2
  31. lamindb/models/_feature_manager.py +131 -71
  32. lamindb/models/_from_values.py +2 -2
  33. lamindb/models/_is_versioned.py +4 -4
  34. lamindb/models/_label_manager.py +4 -4
  35. lamindb/models/artifact.py +357 -192
  36. lamindb/models/artifact_set.py +45 -1
  37. lamindb/models/can_curate.py +1 -2
  38. lamindb/models/collection.py +3 -34
  39. lamindb/models/feature.py +111 -7
  40. lamindb/models/has_parents.py +11 -11
  41. lamindb/models/project.py +42 -2
  42. lamindb/models/query_manager.py +16 -7
  43. lamindb/models/query_set.py +191 -78
  44. lamindb/models/record.py +30 -5
  45. lamindb/models/run.py +10 -33
  46. lamindb/models/save.py +6 -8
  47. lamindb/models/schema.py +54 -26
  48. lamindb/models/sqlrecord.py +152 -40
  49. lamindb/models/storage.py +59 -14
  50. lamindb/models/transform.py +17 -17
  51. lamindb/models/ulabel.py +6 -1
  52. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
  53. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
  54. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
  55. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact.py
@@ -1,7 +1,6 @@
  # ruff: noqa: TC004
  from __future__ import annotations

- import os
  import shutil
  from collections import defaultdict
  from pathlib import Path, PurePath, PurePosixPath
@@ -9,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Literal, Union, overload

  import fsspec
  import lamindb_setup as ln_setup
- import numpy as np
  import pandas as pd
  from anndata import AnnData
  from django.db import connections, models
@@ -63,14 +61,13 @@ from ..core.storage.paths import (
  filepath_cache_key_from_artifact,
  filepath_from_artifact,
  )
- from ..errors import IntegrityError, InvalidArgument, ValidationError
+ from ..errors import InvalidArgument, ValidationError
  from ..models._is_versioned import (
  create_uid,
  )
  from ._django import get_artifact_with_related, get_collection_with_related
  from ._feature_manager import (
  FeatureManager,
- filter_base,
  get_label_links,
  )
  from ._is_versioned import IsVersioned
@@ -201,7 +198,7 @@ def process_pathlike(
  # hence, we revert the creation and throw an error
  storage_record.delete()
  raise UnknownStorageLocation(
- f"Path {filepath} is not contained in any known storage location:\n{Storage.df()[['uid', 'root', 'type']]}\n\n"
+ f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
  f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
  )
  use_existing_storage_key = True
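The `Storage.df()` call above is one instance of a registry-wide rename in 1.11.0: `.df()` becomes `.to_dataframe()`. A minimal sketch of the new spelling (assuming a connected instance):

    import lamindb as ln

    # inspect registered storage locations; pre-1.11 this was ln.Storage.df()
    ln.Storage.to_dataframe()[["uid", "root", "type"]]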
@@ -419,24 +416,6 @@ def get_artifact_kwargs_from_data(
  skip_check_exists,
  is_replace=is_replace,
  )
- stat_or_artifact = get_stat_or_artifact(
- path=path,
- key=key,
- instance=using_key,
- is_replace=is_replace,
- )
- if isinstance(stat_or_artifact, Artifact):
- existing_artifact = stat_or_artifact
- if run is not None:
- existing_artifact._populate_subsequent_runs(run)
- return existing_artifact, None
- else:
- size, hash, hash_type, n_files, revises = stat_or_artifact
-
- if revises is not None: # update provisional_uid
- provisional_uid, revises = create_uid(revises=revises, version=version)
- if settings.cache_dir in path.parents:
- path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))

  check_path_in_storage = False
  if use_existing_storage_key:
@@ -457,6 +436,25 @@ def get_artifact_kwargs_from_data(
  else:
  storage = storage

+ stat_or_artifact = get_stat_or_artifact(
+ path=path,
+ key=key,
+ instance=using_key,
+ is_replace=is_replace,
+ )
+ if isinstance(stat_or_artifact, Artifact):
+ existing_artifact = stat_or_artifact
+ if run is not None:
+ existing_artifact._populate_subsequent_runs(run)
+ return existing_artifact, None
+ else:
+ size, hash, hash_type, n_files, revises = stat_or_artifact
+
+ if revises is not None: # update provisional_uid
+ provisional_uid, revises = create_uid(revises=revises, version=version)
+ if settings.cache_dir in path.parents:
+ path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
  log_storage_hint(
  check_path_in_storage=check_path_in_storage,
  storage=storage,
@@ -542,6 +540,7 @@ def log_storage_hint(
  def data_is_scversedatastructure(
  data: ScverseDataStructures | UPathStr,
  structure_type: Literal["AnnData", "MuData", "SpatialData"] | None = None,
+ cloud_warning: bool = True,
  ) -> bool:
  """Determine whether a specific in-memory object or a UPathstr is any or a specific scverse data structure."""
  file_suffix = None
@@ -551,12 +550,19 @@ def data_is_scversedatastructure(
  file_suffix = ".h5mu"
  # SpatialData does not have a unique suffix but `.zarr`

+ # AnnData allows both AnnDataAccessor and AnnData
+ class_name = data.__class__.__name__
  if structure_type is None:
  return any(
- hasattr(data, "__class__") and data.__class__.__name__ == cl_name
+ class_name
+ in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
  for cl_name in ["AnnData", "MuData", "SpatialData"]
  )
- elif hasattr(data, "__class__") and data.__class__.__name__ == structure_type:
+ elif class_name in (
+ ["AnnData", "AnnDataAccessor"]
+ if structure_type == "AnnData"
+ else [structure_type]
+ ):
  return True

  data_type = structure_type.lower()
@@ -580,11 +586,12 @@ def data_is_scversedatastructure(
  )
  == data_type
  )
- else:
+ elif cloud_warning:
  logger.warning(
  f"we do not check whether cloud zarr is {structure_type}"
  )
  return False
+
  return False


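The new `cloud_warning` flag (threaded through `_check_otype_artifact` below) lets internal callers silence the "we do not check whether cloud zarr is ..." warning for cloud-hosted zarr stores. A hedged sketch against this module-internal helper (the path is illustrative):

    from lamindb.models.artifact import data_is_scversedatastructure

    # the structure of a cloud zarr store is not inspected; with
    # cloud_warning=False the skipped check no longer logs a warning
    data_is_scversedatastructure("s3://my-bucket/store.zarr", "AnnData", cloud_warning=False)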
@@ -600,23 +607,24 @@ def data_is_soma_experiment(data: SOMAExperiment | UPathStr) -> bool:
  def _check_otype_artifact(
  data: UPathStr | pd.DataFrame | ScverseDataStructures,
  otype: str | None = None,
+ cloud_warning: bool = True,
  ) -> str:
  if otype is None:
  if isinstance(data, pd.DataFrame):
- logger.warning("data is a DataFrame, please use .from_df()")
+ logger.warning("data is a DataFrame, please use .from_dataframe()")
  otype = "DataFrame"
  return otype

  data_is_path = isinstance(data, (str, Path))
- if data_is_scversedatastructure(data, "AnnData"):
+ if data_is_scversedatastructure(data, "AnnData", cloud_warning):
  if not data_is_path:
  logger.warning("data is an AnnData, please use .from_anndata()")
  otype = "AnnData"
- elif data_is_scversedatastructure(data, "MuData"):
+ elif data_is_scversedatastructure(data, "MuData", cloud_warning):
  if not data_is_path:
  logger.warning("data is a MuData, please use .from_mudata()")
  otype = "MuData"
- elif data_is_scversedatastructure(data, "SpatialData"):
+ elif data_is_scversedatastructure(data, "SpatialData", cloud_warning):
  if not data_is_path:
  logger.warning("data is a SpatialData, please use .from_spatialdata()")
  otype = "SpatialData"
@@ -871,7 +879,7 @@ def get_labels(

  values = []
  for v in qs_by_registry.values():
- values += v.list(get_name_field(v))
+ values += v.to_list(get_name_field(v))
  return values
  if len(registries_to_check) == 1 and registry in qs_by_registry:
  return qs_by_registry[registry]
@@ -894,7 +902,7 @@ def add_labels(
  raise ValueError("Please save the artifact/collection before adding a label!")

  if isinstance(records, (QuerySet, QuerySet.__base__)): # need to have both
- records = records.list()
+ records = records.to_list()
  if isinstance(records, (str, SQLRecord)):
  records = [records]
  if not isinstance(records, list): # avoids warning for pd Series
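As with `.df()` → `.to_dataframe()`, `QuerySet.list()` is renamed to `QuerySet.to_list()`. A minimal usage sketch (registry and field chosen for illustration):

    import lamindb as ln

    # field-name argument as in v.to_list(get_name_field(v)) above
    names = ln.ULabel.filter().to_list("name")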
@@ -993,6 +1001,112 @@ def add_labels(
  )


+ def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+ # need to grab file path before deletion
+ try:
+ path, _ = filepath_from_artifact(artifact, using_key)
+ except OSError:
+ # we can still delete the record
+ logger.warning("Could not get path")
+ storage = False
+ # only delete in storage if DB delete is successful
+ # DB delete might error because of a foreign key constraint violated etc.
+ if artifact._overwrite_versions and artifact.is_latest:
+ logger.important(
+ "deleting all versions of this artifact because they all share the same store"
+ )
+ for version in artifact.versions.all(): # includes artifact
+ _delete_skip_storage(version)
+ else:
+ artifact._delete_skip_storage()
+ # by default do not delete storage if deleting only a previous version
+ # and the underlying store is mutable
+ if artifact._overwrite_versions and not artifact.is_latest:
+ delete_in_storage = False
+ if storage:
+ logger.warning(
+ "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+ )
+ elif artifact.key is None or artifact._key_is_virtual:
+ # do not ask for confirmation also if storage is None
+ delete_in_storage = storage is None or storage
+ else:
+ # for artifacts with non-virtual semantic storage keys (key is not None)
+ # ask for extra-confirmation if storage is None
+ if storage is None:
+ response = input(
+ f"Are you sure to want to delete {path}? (y/n) You can't undo"
+ " this action."
+ )
+ delete_in_storage = response == "y"
+ else:
+ delete_in_storage = storage
+ if not delete_in_storage:
+ logger.important(f"a file/folder remains here: {path}")
+ # we don't yet have logic to bring back the deleted metadata record
+ # in case storage deletion fails - this is important for ACID down the road
+ if delete_in_storage:
+ delete_msg = delete_storage(path, raise_file_not_found_error=False)
+ if delete_msg != "did-not-delete":
+ logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+ class LazyArtifact:
+ """Lazy artifact for streaming to auto-generated internal paths.
+
+ This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+ and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+ This object creates a real artifact on `.save()` with the provided arguments.
+
+ Args:
+ suffix: The suffix for the auto-generated internal path
+ overwrite_versions: Whether to overwrite versions.
+ **kwargs: Keyword arguments for the artifact to be created.
+
+ Examples:
+
+ Create a lazy artifact, write to the path and save to get a real artifact::
+
+ lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+ zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path
+ artifact = lazy.save()
+ """
+
+ def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+ self.kwargs = kwargs
+ self.kwargs["overwrite_versions"] = overwrite_versions
+
+ if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+ PurePosixPath(key)
+ ) != suffix:
+ raise ValueError(
+ "The suffix argument and the suffix of key should be the same."
+ )
+
+ uid, _ = create_uid(n_full_id=20)
+ storage_key = auto_storage_key_from_artifact_uid(
+ uid, suffix, overwrite_versions=overwrite_versions
+ )
+ storepath = setup_settings.storage.root / storage_key
+
+ self._path = storepath
+
+ @property
+ def path(self) -> UPath:
+ return self._path
+
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+ artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+ return artifact.save(upload=upload, **kwargs)
+
+ def __repr__(self) -> str: # pragma: no cover
+ show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+ return (
+ f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+ )
+
+
  class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  # Note that this docstring has to be consistent with Curator.save_artifact()
  """Datasets & models stored as files, folders, or arrays.
@@ -1028,15 +1142,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()

- If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
+ If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::

  schema = ln.Schema(itype=ln.Feature) # a schema that merely enforces that feature names exist in the Feature registry
- artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save() # validated and annotated
+ artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save() # validated and annotated
+
+ To annotate by **external features**::
+
+ schema = ln.examples.schemas.valid_features()
+ artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+ A `schema` can be optionally passed to also validate the features.

  You can make a **new version** of an artifact by passing an existing `key`::

  artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
- artifact_v2.versions.df() # see all versions
+ artifact_v2.versions.to_dataframe() # see all versions

  You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::

@@ -1110,6 +1231,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
  abstract = False
+ app_label = "lamindb"
  constraints = [
  # a simple hard unique constraint on `hash` clashes with the fact
  # that pipelines sometimes aim to ingest the exact same file in different
@@ -1157,11 +1279,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  ln.Artifact.filter(scientist="Barbara McClintock")

- Features may or may not be part of the dataset, i.e., the artifact content in storage. For
- instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
- `DataFrame`-like artifact and annotates it with features corresponding to
- these columns. `artifact.features.add_values`, by contrast, does not
- validate the content of the artifact.
+ Features may or may not be part of the dataset, i.e., the artifact content in storage.
+ For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+ `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+ `artifact.features.add_values`, by contrast, does not validate the content of the artifact.

  .. dropdown:: An example for a model-like artifact

@@ -1176,6 +1297,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  "subset_highlyvariable": True,
  },
  })
+
+ To validate external features::
+
+ schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+ artifact.features.add_values({"species": "bird"}, schema=schema)
  """
  from ._feature_manager import FeatureManager

@@ -1385,15 +1511,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  # now proceed with the user-facing constructor
  if len(args) > 1:
  raise ValueError("Only one non-keyword arg allowed: data")
+
  data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
  kind: str = kwargs.pop("kind", None)
  key: str | None = kwargs.pop("key", None)
  run_id: int | None = kwargs.pop("run_id", None) # for REST API
  run: Run | None = kwargs.pop("run", None)
+ using_key = kwargs.pop("using_key", None)
  description: str | None = kwargs.pop("description", None)
  revises: Artifact | None = kwargs.pop("revises", None)
  overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
  version: str | None = kwargs.pop("version", None)
+
+ features: dict[str, Any] = kwargs.pop("features", None)
+ schema: Schema | None = kwargs.pop("schema", None)
+ if features is not None and schema is not None:
+ from lamindb.curators import DataFrameCurator
+
+ temp_df = pd.DataFrame([features])
+ validation_schema = schema
+ if schema.itype == "Composite" and schema.slots:
+ if len(schema.slots) > 1:
+ raise ValueError(
+ f"Composite schema has {len(schema.slots)} slots. "
+ "External feature validation only supports schemas with a single slot."
+ )
+ try:
+ validation_schema = next(
+ k for k in schema.slots.keys() if k.startswith("__external")
+ )
+ except StopIteration:
+ raise ValueError(
+ "External feature validation requires a slot that starts with __external."
+ ) from None
+
+ external_curator = DataFrameCurator(temp_df, validation_schema)
+ external_curator.validate()
+ external_curator._artifact = self
+
+ self._external_features = features
+
  branch_id: int | None = None
  if "visibility" in kwargs: # backward compat
  branch_id = kwargs.pop("visibility")
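With this block, `features` and `schema` can be passed directly to the `Artifact()` constructor and are validated before anything is written. A minimal sketch assembled from the docstring examples above (assumes a fresh instance):

    import lamindb as ln

    species = ln.Feature(name="species", dtype=str).save()
    schema = ln.Schema([species]).save()
    artifact = ln.Artifact(
        "./my_file.parquet",
        key="examples/my_file.parquet",
        features={"species": "bird"},  # validated against schema at construction time
        schema=schema,
    ).save()  # on save, the features are attached via artifact.features.add_values()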
@@ -1404,13 +1561,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  else:
  branch_id = 1
  branch = kwargs.pop("branch", None)
+
  space = kwargs.pop("space", None)
- space_id = kwargs.pop("space_id", 1)
+ assert "space_id" not in kwargs, "please pass space instead" # noqa: S101
  format = kwargs.pop("format", None)
  _is_internal_call = kwargs.pop("_is_internal_call", False)
  skip_check_exists = kwargs.pop("skip_check_exists", False)
+ storage_was_passed = False
  if "storage" in kwargs:
  storage = kwargs.pop("storage")
+ storage_was_passed = True
  elif (
  setup_settings.instance.keep_artifacts_local
  and setup_settings.instance._local_storage is not None
@@ -1418,13 +1578,32 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  storage = setup_settings.instance.local_storage.record
  else:
  storage = setup_settings.instance.storage.record
- using_key = kwargs.pop("using_key", None)
+ if space is None:
+ from lamindb import context as run_context
+
+ if run_context.space is not None:
+ space = run_context.space
+ elif setup_settings.space is not None:
+ space = setup_settings.space
+ if space is not None and space != storage.space:
+ if storage_was_passed:
+ logger.warning(
+ "storage argument ignored as storage information from space takes precedence"
+ )
+ storage_locs_for_space = Storage.filter(space=space)
+ storage = storage_locs_for_space.first()
+ if len(storage_locs_for_space) > 1:
+ logger.warning(
+ f"more than one storage location for space {space}, choosing {storage}"
+ )
  otype = kwargs.pop("otype") if "otype" in kwargs else None
  if isinstance(data, str) and data.startswith("s3:///"):
  # issue in Groovy / nf-lamin producing malformed S3 paths
  # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q
  data = data.replace("s3:///", "s3://")
- otype = _check_otype_artifact(data=data, otype=otype)
+ otype = _check_otype_artifact(
+ data=data, otype=otype, cloud_warning=not _is_internal_call
+ )
  if "type" in kwargs:
  logger.warning("`type` will be removed soon, please use `kind`")
  kind = kwargs.pop("type")
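Two behavior changes surface here: passing `space_id` now raises (pass `space` instead), and when a space is set, the storage location is resolved from that space, overriding a passed `storage` argument with a warning. A hedged sketch (assumes a Space named "our-team" with a registered storage location):

    import lamindb as ln

    space = ln.Space.get(name="our-team")  # hypothetical lookup
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet", space=space)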
@@ -1457,6 +1636,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  )
  else:
  is_automanaged_path = False
+
  provisional_uid, revises = create_uid(revises=revises, version=version)
  kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
  data=data,
@@ -1514,7 +1694,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  uid, revises = create_uid(revises=revises, version=version)
  kwargs["uid"] = uid

- # only set key now so that we don't do a look-up on it in case revises is passed
+ # only set key now so that we don't perform a look-up on it in case revises is passed
  if revises is not None and revises.key is not None and kwargs["key"] is None:
  kwargs["key"] = revises.key

@@ -1526,7 +1706,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  kwargs["branch"] = branch
  kwargs["branch_id"] = branch_id
  kwargs["space"] = space
- kwargs["space_id"] = space_id
  kwargs["otype"] = otype
  kwargs["revises"] = revises
  # this check needs to come down here because key might be populated from an
@@ -1540,6 +1719,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  super().__init__(**kwargs)

+ @classmethod
+ def from_lazy(
+ cls,
+ suffix: str,
+ overwrite_versions: bool,
+ key: str | None = None,
+ description: str | None = None,
+ run: Run | None = None,
+ **kwargs,
+ ) -> LazyArtifact:
+ """Create a lazy artifact for streaming to auto-generated internal paths.
+
+ This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+ and register the path as an artifact.
+
+ The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+ on `.save()` with the provided arguments.
+
+ Args:
+ suffix: The suffix for the auto-generated internal path
+ overwrite_versions: Whether to overwrite versions.
+ key: An optional key to reference the artifact.
+ description: A description.
+ run: The run that creates the artifact.
+ **kwargs: Other keyword arguments for the artifact to be created.
+
+ Examples:
+
+ Create a lazy artifact, write to the path and save to get a real artifact::
+
+ lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+ zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path
+ artifact = lazy.save()
+ """
+ args = {"key": key, "description": description, "run": run, **kwargs}
+ return LazyArtifact(suffix, overwrite_versions, **args)
+
  @property
  @deprecated("kind")
  def type(self) -> str:
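Beyond the docstring example, `from_lazy` is useful for writing large objects straight to the target store instead of materializing them locally first. A hedged sketch streaming an `AnnData` (assumes an in-memory `adata`; `write_zarr` is anndata's zarr writer):

    import lamindb as ln

    lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="scrna/adata.zarr")
    adata.write_zarr(lazy.path)  # stream to the auto-generated internal path
    artifact = lazy.save()       # registers the path as a regular Artifact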
@@ -1623,6 +1839,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  idlike: Either a uid stub, uid or an integer id.
  is_run_input: Whether to track this artifact as run input.
  expressions: Fields and values passed as Django query expressions.
+ Use `path=...` to get an artifact for a local or remote filepath if it exists.

  Raises:
  :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1637,6 +1854,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):

  artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
  artifact = ln.Artifact.get(key="examples/my_file.parquet")
+ artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
  """
  from .query_set import QuerySet

@@ -1668,45 +1886,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  ln.Artifact.filter(cell_type_by_model__name="T cell")

  """
- from .query_set import QuerySet
-
- if expressions:
- keys_normalized = [key.split("__")[0] for key in expressions]
- field_or_feature_or_param = keys_normalized[0].split("__")[0]
- if field_or_feature_or_param in Artifact.__get_available_fields__():
- qs = QuerySet(model=cls).filter(*queries, **expressions)
- if not any(e.startswith("kind") for e in expressions):
- return qs.exclude(kind="__lamindb_run__")
- else:
- return qs
- elif all(
- features_validated := Feature.validate(
- keys_normalized, field="name", mute=True
- )
- ):
- return filter_base(Artifact, **expressions)
- else:
- features = ", ".join(
- sorted(np.array(keys_normalized)[~features_validated])
- )
- message = f"feature names: {features}"
- avail_fields = cls.__get_available_fields__()
- if "_branch_code" in avail_fields:
- avail_fields.remove("_branch_code") # backward compat
- fields = ", ".join(sorted(avail_fields))
- raise InvalidArgument(
- f"You can query either by available fields: {fields}\n"
- f"Or fix invalid {message}"
- )
- else:
- return (
- QuerySet(model=cls)
- .filter(*queries, **expressions)
- .exclude(kind="__lamindb_run__")
- )
+ # from Registry metaclass
+ return type(cls).filter(cls, *queries, **expressions)

  @classmethod
- def from_df(
+ def from_dataframe(
  cls,
  df: pd.DataFrame,
  *,
@@ -1715,6 +1899,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  run: Run | None = None,
  revises: Artifact | None = None,
  schema: Schema | None = None,
+ features: dict[str, Any] | None = None,
  **kwargs,
  ) -> Artifact:
  """Create from `DataFrame`, optionally validate & annotate.
@@ -1727,6 +1912,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  revises: An old version of the artifact.
  run: The run that creates the artifact.
  schema: A schema that defines how to validate & annotate.
+ features: External features dict for additional annotation.

  See Also:
  :meth:`~lamindb.Collection`
@@ -1741,7 +1927,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  import lamindb as ln

  df = ln.core.datasets.mini_immuno.get_dataset1()
- artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+ artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()

  With validation and annotation.

@@ -1758,6 +1944,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  .. literalinclude:: scripts/define_mini_immuno_features_labels.py
  :language: python

+ External features:
+
+ .. literalinclude:: scripts/curate_dataframe_external_features.py
+ :language: python
  """
  artifact = Artifact( # type: ignore
  data=df,
@@ -1770,8 +1960,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  **kwargs,
  )
  artifact.n_observations = len(df)
+
  if schema is not None:
- from ..curators import DataFrameCurator
+ from lamindb.curators.core import ComponentCurator

  if not artifact._state.adding and artifact.suffix != ".parquet":
  logger.warning(
@@ -1780,12 +1971,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  )
  return artifact

- curator = DataFrameCurator(artifact, schema)
- curator.validate()
- artifact.schema = schema
- artifact._curator = curator
+ # Handle external features validation for Composite schemas
+ if schema.itype == "Composite" and features is not None:
+ try:
+ external_slot = next(
+ k for k in schema.slots.keys() if "__external__" in k
+ )
+ validation_schema = schema.slots[external_slot]
+ except StopIteration:
+ raise ValueError(
+ "External feature validation requires a slot __external__."
+ ) from None
+
+ external_curator = ComponentCurator(
+ pd.DataFrame([features]), validation_schema
+ )
+ external_curator.validate()
+ artifact._external_features = features
+
+ # Validate main DataFrame if not Composite or if Composite has attrs
+ if schema.itype != "Composite" or "attrs" in schema.slots:
+ curator = ComponentCurator(artifact, schema)
+ curator.validate()
+ artifact.schema = schema
+ artifact._curator = curator
+
  return artifact

+ @classmethod
+ @deprecated("from_dataframe")
+ def from_df(
+ cls,
+ df: pd.DataFrame,
+ *,
+ key: str | None = None,
+ description: str | None = None,
+ run: Run | None = None,
+ revises: Artifact | None = None,
+ schema: Schema | None = None,
+ **kwargs,
+ ) -> Artifact:
+ return cls.from_dataframe(
+ df,
+ key=key,
+ description=description,
+ run=run,
+ revises=revises,
+ schema=schema,
+ **kwargs,
+ )
+
  @classmethod
  def from_anndata(
  cls,
@@ -2285,17 +2520,19 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  ):
  """Open a dataset for streaming.

- Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
- `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
+ Works for `AnnData` (`.h5ad` and `.zarr`), `SpatialData` (`.zarr`),
+ generic `hdf5` and `zarr`, `tiledbsoma` objects (`.tiledbsoma`),
+ `pyarrow` or `polars` compatible formats
  (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).

  Args:
- mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
+ mode: can be `"r"` or `"w"` (write mode) for `tiledbsoma` stores,
+ `"r"` or `"r+"` for `AnnData` or `SpatialData` `zarr` stores,
  otherwise should be always `"r"` (read-only mode).
  engine: Which module to use for lazy loading of a dataframe
  from `pyarrow` or `polars` compatible formats.
  This has no effect if the artifact is not a dataframe, i.e.
- if it is an `AnnData,` `hdf5`, `zarr` or `tiledbsoma` object.
+ if it is an `AnnData,` `hdf5`, `zarr`, `tiledbsoma` object etc.
  is_run_input: Whether to track this artifact as run input.
  **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
  `pyarrow.dataset.dataset`, `polars.scan_*` function.
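`mode="r+"` is new for zarr-backed `AnnData`/`SpatialData`; previously only tiledbsoma stores could be opened with `mode != "r"`. A hedged usage sketch (key assumed):

    artifact = ln.Artifact.get(key="mydata.zarr")
    access = artifact.open(mode="r+")  # write-enabled accessor; the local cache is bypassed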
@@ -2339,7 +2576,8 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  s + ".gz" for s in PYARROW_SUFFIXES
  ) # this doesn't work for externally gzipped files, REMOVE LATER
  )
- if self.suffix not in suffixes:
+ suffix = self.suffix
+ if suffix not in suffixes:
  raise ValueError(
  "Artifact should have a zarr, h5, tiledbsoma object"
  " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
@@ -2348,23 +2586,28 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
  " (no mixing allowed)."
  )
- if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
- raise ValueError(
- "Only a tiledbsoma store can be openened with `mode!='r'`."
- )
-
  using_key = settings._using_key
  filepath, cache_key = filepath_cache_key_from_artifact(
  self, using_key=using_key
  )
+
  is_tiledbsoma_w = (
- filepath.name == "soma" or self.suffix == ".tiledbsoma"
+ filepath.name == "soma" or suffix == ".tiledbsoma"
  ) and mode == "w"
+ is_zarr_w = suffix == ".zarr" and mode == "r+"
+
+ if mode != "r" and not (is_tiledbsoma_w or is_zarr_w):
+ raise ValueError(
+ f"It is not allowed to open a {suffix} object with mode='{mode}'. "
+ "You can open all supported formats with mode='r', "
+ "a tiledbsoma store with mode='w', "
+ "AnnData or SpatialData zarr store with mode='r+'."
+ )
  # consider the case where an object is already locally cached
  localpath = setup_settings.paths.cloud_to_local_no_update(
  filepath, cache_key=cache_key
  )
- if is_tiledbsoma_w:
+ if is_tiledbsoma_w or is_zarr_w:
  open_cache = False
  else:
  open_cache = not isinstance(
@@ -2395,9 +2638,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  else:
  localpath.unlink(missing_ok=True)
  else:
- access = backed_access(
- filepath, mode, engine, using_key=using_key, **kwargs
- )
+ access = backed_access(self, mode, engine, using_key=using_key, **kwargs)
  if is_tiledbsoma_w:

  def finalize():
@@ -2413,6 +2654,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  new_version = Artifact(
  filepath, revises=self, _is_internal_call=True
  ).save()
+ # note: sets _state.db = "default"
  init_self_from_db(self, new_version)

  if localpath != filepath and localpath.exists():
@@ -2569,94 +2811,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
  artifact.delete() # delete all versions, the data will be deleted or prompted for deletion.
  """
- # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
- # storage = True if storage is None else storage
-
- # this first check means an invalid delete fails fast rather than cascading through
- # database and storage permission errors
- if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
- isettings = setup_settings.instance
- if self.storage.instance_uid != isettings.uid and (
- storage or storage is None
- ):
- raise IntegrityError(
- "Cannot simply delete artifacts outside of this instance's managed storage locations."
- "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
- f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
- f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
- )
- # by default, we only move artifacts into the trash (branch_id = -1)
- trash_branch_id = -1
- if self.branch_id > trash_branch_id and not permanent:
- if storage is not None:
- logger.warning("moving artifact to trash, storage arg is ignored")
- # move to trash
- self.branch_id = trash_branch_id
- self.save()
- logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
- return
-
- # if the artifact is already in the trash
- # permanent delete skips the trash
- if permanent is None:
- # ask for confirmation of permanent delete
- response = input(
- "Artifact record is already in trash! Are you sure you want to permanently"
- " delete it? (y/n) You can't undo this action."
- )
- delete_record = response == "y"
- else:
- assert permanent # noqa: S101
- delete_record = True
-
- if delete_record:
- # need to grab file path before deletion
- try:
- path, _ = filepath_from_artifact(self, using_key)
- except OSError:
- # we can still delete the record
- logger.warning("Could not get path")
- storage = False
- # only delete in storage if DB delete is successful
- # DB delete might error because of a foreign key constraint violated etc.
- if self._overwrite_versions and self.is_latest:
- logger.important(
- "deleting all versions of this artifact because they all share the same store"
- )
- for version in self.versions.all(): # includes self
- _delete_skip_storage(version)
- else:
- self._delete_skip_storage()
- # by default do not delete storage if deleting only a previous version
- # and the underlying store is mutable
- if self._overwrite_versions and not self.is_latest:
- delete_in_storage = False
- if storage:
- logger.warning(
- "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
- )
- elif self.key is None or self._key_is_virtual:
- # do not ask for confirmation also if storage is None
- delete_in_storage = storage is None or storage
- else:
- # for artifacts with non-virtual semantic storage keys (key is not None)
- # ask for extra-confirmation
- if storage is None:
- response = input(
- f"Are you sure to want to delete {path}? (y/n) You can't undo"
- " this action."
- )
- delete_in_storage = response == "y"
- else:
- delete_in_storage = storage
- if not delete_in_storage:
- logger.important(f"a file/folder remains here: {path}")
- # we don't yet have logic to bring back the deleted metadata record
- # in case storage deletion fails - this is important for ACID down the road
- if delete_in_storage:
- delete_msg = delete_storage(path, raise_file_not_found_error=False)
- if delete_msg != "did-not-delete":
- logger.success(f"deleted {colors.yellow(f'{path}')}")
+ super().delete(permanent=permanent, storage=storage, using_key=using_key)

  @property
  def _is_saved_to_storage_location(self) -> bool | None:
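The trash/permanent-delete logic that previously lived in this method now runs in the base class's `delete()` (via `super().delete(...)`), with the storage-deletion part factored into the module-level `delete_permanently()` above; the user-facing behavior is unchanged. A short sketch:

    artifact.delete()                # moves the record to trash (branch_id = -1)
    artifact.delete(permanent=True)  # deletes the record and, where applicable, the store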
@@ -2785,11 +2940,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
  local_path_cache,
  )
  logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+ # Handle external features
+ if hasattr(self, "_external_features") and self._external_features is not None:
+ external_features = self._external_features
+ delattr(self, "_external_features")
+ self.features.add_values(external_features)
+
+ # annotate Artifact
  if hasattr(self, "_curator"):
  curator = self._curator
  delattr(self, "_curator")
  # just annotates this artifact
  curator.save_artifact()
+
  return self

  def restore(self) -> None:
@@ -2837,7 +3001,7 @@ def _synchronize_cleanup_on_error(


 def _delete_skip_storage(artifact, *args, **kwargs) -> None:
- super(Artifact, artifact).delete(*args, **kwargs)
+ super(SQLRecord, artifact).delete(*args, **kwargs)


 def _save_skip_storage(artifact, **kwargs) -> None:
@@ -2855,6 +3019,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
  featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")

  class Meta:
+ app_label = "lamindb"
  unique_together = ("artifact", "featurevalue")
