lamindb 1.10.2__py3-none-any.whl → 1.11a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +14 -12
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +2 -2
- lamindb/core/storage/_anndata_accessor.py +29 -9
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +432 -186
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +12 -2
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +1 -1
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/datasets/mini_immuno.py +1 -0
- lamindb/migrations/0119_squashed.py +5 -2
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +53 -0
- lamindb/models/__init__.py +3 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +53 -53
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +305 -116
- lamindb/models/artifact_set.py +36 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +18 -0
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +59 -34
- lamindb/models/record.py +25 -4
- lamindb/models/run.py +8 -6
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +123 -25
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/METADATA +4 -5
- {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/RECORD +47 -44
- {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/WHEEL +1 -1
- {lamindb-1.10.2.dist-info/licenses → lamindb-1.11a1.dist-info}/LICENSE +0 -0
lamindb/models/artifact.py
CHANGED
@@ -1,7 +1,6 @@
 # ruff: noqa: TC004
 from __future__ import annotations
 
-import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
@@ -63,7 +62,7 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import
+from ..errors import InvalidArgument, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
@@ -201,7 +200,7 @@ def process_pathlike(
             # hence, we revert the creation and throw an error
             storage_record.delete()
             raise UnknownStorageLocation(
-                f"Path {filepath} is not contained in any known storage location:\n{Storage.
+                f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
                 f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
             )
             use_existing_storage_key = True
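Note: the rewritten error message reflects a release-wide accessor rename from `.df()` to `.to_dataframe()` on registries and querysets. A minimal usage sketch (assumes an initialized lamindb instance):

    import lamindb as ln

    # list managed storage locations, as the error message above does;
    # `.to_dataframe()` supersedes the earlier `.df()` accessor
    ln.Storage.to_dataframe()[["uid", "root", "type"]]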
@@ -552,12 +551,19 @@ def data_is_scversedatastructure(
         file_suffix = ".h5mu"
     # SpatialData does not have a unique suffix but `.zarr`
 
+    # AnnData allows both AnnDataAccessor and AnnData
+    class_name = data.__class__.__name__
     if structure_type is None:
         return any(
-
+            class_name
+            in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
             for cl_name in ["AnnData", "MuData", "SpatialData"]
         )
-    elif
+    elif class_name in (
+        ["AnnData", "AnnDataAccessor"]
+        if structure_type == "AnnData"
+        else [structure_type]
+    ):
         return True
 
     data_type = structure_type.lower()
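Distilled from the hunk above: the structure check now treats a backed `AnnDataAccessor` the same as an in-memory `AnnData`. A standalone sketch of the comparison (the function name is illustrative, not part of the library):

    def matches_structure(data, structure_type: str) -> bool:
        # a backed AnnDataAccessor now passes the AnnData check
        class_name = data.__class__.__name__
        accepted = (
            ["AnnData", "AnnDataAccessor"]
            if structure_type == "AnnData"
            else [structure_type]
        )
        return class_name in accepted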
@@ -586,6 +592,7 @@ def data_is_scversedatastructure(
                 f"we do not check whether cloud zarr is {structure_type}"
             )
             return False
+
     return False
 
 
@@ -605,7 +612,7 @@ def _check_otype_artifact(
 ) -> str:
     if otype is None:
         if isinstance(data, pd.DataFrame):
-            logger.warning("data is a DataFrame, please use .
+            logger.warning("data is a DataFrame, please use .from_dataframe()")
             otype = "DataFrame"
         return otype
 
@@ -873,7 +880,7 @@ def get_labels(
 
         values = []
         for v in qs_by_registry.values():
-            values += v.
+            values += v.to_list(get_name_field(v))
         return values
     if len(registries_to_check) == 1 and registry in qs_by_registry:
         return qs_by_registry[registry]
@@ -896,7 +903,7 @@ def add_labels(
         raise ValueError("Please save the artifact/collection before adding a label!")
 
     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
-        records = records.
+        records = records.to_list()
     if isinstance(records, (str, SQLRecord)):
         records = [records]
     if not isinstance(records, list):  # avoids warning for pd Series
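Both call sites above move from the old truncated accessors to `to_list()`. Judging from these calls, `to_list()` with no argument materializes the queryset into a list of records, while `to_list(field_name)` returns that field's values; a hedged sketch:

    import lamindb as ln

    records = ln.ULabel.filter().to_list()      # list of records
    names = ln.ULabel.filter().to_list("name")  # list of field values, as in the get_labels call site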
@@ -995,6 +1002,112 @@ def add_labels(
         )
 
 
+def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+    # need to grab file path before deletion
+    try:
+        path, _ = filepath_from_artifact(artifact, using_key)
+    except OSError:
+        # we can still delete the record
+        logger.warning("Could not get path")
+        storage = False
+    # only delete in storage if DB delete is successful
+    # DB delete might error because of a foreign key constraint violated etc.
+    if artifact._overwrite_versions and artifact.is_latest:
+        logger.important(
+            "deleting all versions of this artifact because they all share the same store"
+        )
+        for version in artifact.versions.all():  # includes artifact
+            _delete_skip_storage(version)
+    else:
+        artifact._delete_skip_storage()
+    # by default do not delete storage if deleting only a previous version
+    # and the underlying store is mutable
+    if artifact._overwrite_versions and not artifact.is_latest:
+        delete_in_storage = False
+        if storage:
+            logger.warning(
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+            )
+    elif artifact.key is None or artifact._key_is_virtual:
+        # do not ask for confirmation also if storage is None
+        delete_in_storage = storage is None or storage
+    else:
+        # for artifacts with non-virtual semantic storage keys (key is not None)
+        # ask for extra-confirmation
+        if storage is None:
+            response = input(
+                f"Are you sure to want to delete {path}? (y/n) You can't undo"
+                " this action."
+            )
+            delete_in_storage = response == "y"
+        else:
+            delete_in_storage = storage
+    if not delete_in_storage:
+        logger.important(f"a file/folder remains here: {path}")
+    # we don't yet have logic to bring back the deleted metadata record
+    # in case storage deletion fails - this is important for ACID down the road
+    if delete_in_storage:
+        delete_msg = delete_storage(path, raise_file_not_found_error=False)
+        if delete_msg != "did-not-delete":
+            logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+class LazyArtifact:
+    """Lazy artifact for streaming to auto-generated internal paths.
+
+    This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+    and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+    This object creates a real artifact on `.save()` with the provided arguments.
+
+    Args:
+        suffix: The suffix for the auto-generated internal path.
+        overwrite_versions: Whether to overwrite versions.
+        **kwargs: Keyword arguments for the artifact to be created.
+
+    Examples:
+
+        Create a lazy artifact, write to the path and save to get a real artifact::
+
+            lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+            zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+            artifact = lazy.save()
+    """
+
+    def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+        self.kwargs = kwargs
+        self.kwargs["overwrite_versions"] = overwrite_versions
+
+        if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+            PurePosixPath(key)
+        ) != suffix:
+            raise ValueError(
+                "The suffix argument and the suffix of key should be the same."
+            )
+
+        uid, _ = create_uid(n_full_id=20)
+        storage_key = auto_storage_key_from_artifact_uid(
+            uid, suffix, overwrite_versions=overwrite_versions
+        )
+        storepath = setup_settings.storage.root / storage_key
+
+        self._path = storepath
+
+    @property
+    def path(self) -> UPath:
+        return self._path
+
+    def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+        artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+        return artifact.save(upload=upload, **kwargs)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+        return (
+            f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+        )
+
+
 class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     # Note that this docstring has to be consistent with Curator.save_artifact()
     """Datasets & models stored as files, folders, or arrays.
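The `LazyArtifact` docstring example, made self-contained (assumes `zarr` and `numpy` are installed and a lamindb instance is initialized):

    import lamindb as ln
    import numpy as np
    import zarr

    # reserve an auto-generated internal path, stream into it, then register it on save
    lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
    zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])
    artifact = lazy.save()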
@@ -1030,15 +1143,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
 
-    If you want to **validate & annotate** an array, pass a `schema` to one of the `.
+    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::
 
         schema = ln.Schema(itype=ln.Feature)  # a schema that merely enforces that feature names exist in the Feature registry
-        artifact = ln.Artifact.
+        artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+
+    To annotate by **external features**::
+
+        schema = ln.examples.schemas.valid_features()
+        artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+    A `schema` can be optionally passed to also validate the features.
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
         artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
-        artifact_v2.versions.
+        artifact_v2.versions.to_dataframe()  # see all versions
 
     You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
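The new `features` constructor argument in action, mirroring the docstring above (paths are placeholders):

    import lamindb as ln

    # annotate with external features that are not part of the file's content
    artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
    # optionally pass a schema, e.g. ln.examples.schemas.valid_features(),
    # via `schema=...` to also validate the features before saving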
@@ -1112,6 +1232,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
         abstract = False
+        app_label = "lamindb"
         constraints = [
             # a simple hard unique constraint on `hash` clashes with the fact
             # that pipelines sometimes aim to ingest the exact same file in different
@@ -1159,11 +1280,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         ln.Artifact.filter(scientist="Barbara McClintock")
 
-    Features may or may not be part of the dataset, i.e., the artifact content in storage.
-    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
-    `DataFrame`-like artifact and annotates it with features corresponding to
-
-    validate the content of the artifact.
+    Features may or may not be part of the dataset, i.e., the artifact content in storage.
+    For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+    `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+    `artifact.features.add_values`, by contrast, does not validate the content of the artifact.
 
     .. dropdown:: An example for a model-like artifact
 
@@ -1178,6 +1298,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 "subset_highlyvariable": True,
             },
         })
+
+    To validate external features::
+
+        schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+        artifact.features.add_values({"species": "bird"}, schema=schema)
     """
     from ._feature_manager import FeatureManager
 
@@ -1387,15 +1512,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # now proceed with the user-facing constructor
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: data")
+
         data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
         kind: str = kwargs.pop("kind", None)
         key: str | None = kwargs.pop("key", None)
         run_id: int | None = kwargs.pop("run_id", None)  # for REST API
         run: Run | None = kwargs.pop("run", None)
+        using_key = kwargs.pop("using_key", None)
         description: str | None = kwargs.pop("description", None)
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
+
+        features: dict[str, Any] = kwargs.pop("features", None)
+        schema: Schema | None = kwargs.pop("schema", None)
+        if features is not None and schema is not None:
+            from lamindb.curators import DataFrameCurator
+
+            temp_df = pd.DataFrame([features])
+            validation_schema = schema
+            if schema.itype == "Composite" and schema.slots:
+                if len(schema.slots) > 1:
+                    raise ValueError(
+                        f"Composite schema has {len(schema.slots)} slots. "
+                        "External feature validation only supports schemas with a single slot."
+                    )
+                try:
+                    validation_schema = next(
+                        k for k in schema.slots.keys() if k.startswith("__external")
+                    )
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot that starts with __external."
+                    ) from None
+
+            external_curator = DataFrameCurator(temp_df, validation_schema)
+            external_curator.validate()
+            external_curator._artifact = self
+
+            self._external_features = features
+
         branch_id: int | None = None
         if "visibility" in kwargs:  # backward compat
             branch_id = kwargs.pop("visibility")
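Mechanism of the new constructor block, distilled: the `features` dict is wrapped into a one-row DataFrame so the ordinary DataFrame curation machinery can validate it (a sketch, not the full code path):

    import pandas as pd

    features = {"species": "bird"}
    temp_df = pd.DataFrame([features])  # one row; columns are the feature names
    # a DataFrameCurator validates temp_df against the schema (for Composite
    # schemas, against the single slot whose name starts with "__external");
    # on success the dict is stashed as `_external_features` and written on `.save()`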
@@ -1406,13 +1562,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         else:
             branch_id = 1
         branch = kwargs.pop("branch", None)
+
         space = kwargs.pop("space", None)
-        space_id
+        assert "space_id" not in kwargs, "please pass space instead"  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
+        storage_was_passed = False
         if "storage" in kwargs:
             storage = kwargs.pop("storage")
+            storage_was_passed = True
         elif (
             setup_settings.instance.keep_artifacts_local
             and setup_settings.instance._local_storage is not None
@@ -1420,7 +1579,24 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             storage = setup_settings.instance.local_storage.record
         else:
             storage = setup_settings.instance.storage.record
-
+        if space is None:
+            from lamindb import context as run_context
+
+            if run_context.space is not None:
+                space = run_context.space
+            elif setup_settings.space is not None:
+                space = setup_settings.space
+        if space is not None and space != storage.space:
+            if storage_was_passed:
+                logger.warning(
+                    "storage argument ignored as storage information from space takes precedence"
+                )
+            storage_locs_for_space = Storage.filter(space=space)
+            storage = storage_locs_for_space.first()
+            if len(storage_locs_for_space) > 1:
+                logger.warning(
+                    f"more than one storage location for space {space}, choosing {storage}"
+                )
         otype = kwargs.pop("otype") if "otype" in kwargs else None
         if isinstance(data, str) and data.startswith("s3:///"):
             # issue in Groovy / nf-lamin producing malformed S3 paths
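The new block resolves a target space and, from it, the storage location. A hypothetical helper mirroring that precedence (parameter names are illustrative, not library API):

    from lamindb import Storage

    def resolve_space_and_storage(space, storage, run_context_space, settings_space):
        # 1) explicit `space=` argument, else 2) run-context space, else 3) settings space
        if space is None:
            space = run_context_space if run_context_space is not None else settings_space
        # a resolved space takes precedence over any explicitly passed storage:
        # the first storage location registered for the space is chosen
        if space is not None and space != storage.space:
            storage = Storage.filter(space=space).first()
        return space, storage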
@@ -1461,6 +1637,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
         else:
             is_automanaged_path = False
+
         provisional_uid, revises = create_uid(revises=revises, version=version)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
@@ -1518,7 +1695,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             uid, revises = create_uid(revises=revises, version=version)
             kwargs["uid"] = uid
 
-        # only set key now so that we don't
+        # only set key now so that we don't perform a look-up on it in case revises is passed
         if revises is not None and revises.key is not None and kwargs["key"] is None:
             kwargs["key"] = revises.key
 
@@ -1530,7 +1707,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["branch"] = branch
         kwargs["branch_id"] = branch_id
         kwargs["space"] = space
-        kwargs["space_id"] = space_id
         kwargs["otype"] = otype
         kwargs["revises"] = revises
         # this check needs to come down here because key might be populated from an
@@ -1544,6 +1720,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path.
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1627,6 +1840,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             idlike: Either a uid stub, uid or an integer id.
             is_run_input: Whether to track this artifact as run input.
             expressions: Fields and values passed as Django query expressions.
+                Use `path=...` to get an artifact for a local or remote filepath if it exists.
 
         Raises:
             :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1641,6 +1855,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
             artifact = ln.Artifact.get(key="examples/my_file.parquet")
+            artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
         """
         from .query_set import QuerySet
 
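Summary of the lookup options after this change (the S3 URI is a placeholder):

    import lamindb as ln

    artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")                # by uid
    artifact = ln.Artifact.get(key="examples/my_file.parquet")        # by key
    artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")  # new: by storage path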
@@ -1710,7 +1925,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         )
 
     @classmethod
-    def
+    def from_dataframe(
         cls,
         df: pd.DataFrame,
         *,
@@ -1719,6 +1934,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         run: Run | None = None,
         revises: Artifact | None = None,
         schema: Schema | None = None,
+        features: dict[str, Any] | None = None,
         **kwargs,
     ) -> Artifact:
         """Create from `DataFrame`, optionally validate & annotate.
@@ -1731,6 +1947,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
+            features: External features dict for additional annotation.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1745,7 +1962,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             import lamindb as ln
 
             df = ln.core.datasets.mini_immuno.get_dataset1()
-            artifact = ln.Artifact.
+            artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
         With validation and annotation.
 
@@ -1762,6 +1979,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         .. literalinclude:: scripts/define_mini_immuno_features_labels.py
             :language: python
 
+        External features:
+
+        .. literalinclude:: scripts/curate_dataframe_external_features.py
+            :language: python
         """
         artifact = Artifact(  # type: ignore
             data=df,
@@ -1774,8 +1995,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
+
         if schema is not None:
-            from
+            from lamindb.curators.core import ComponentCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1784,12 +2006,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-
-
-
-
+            # Handle external features validation for Composite schemas
+            if schema.itype == "Composite" and features is not None:
+                try:
+                    external_slot = next(
+                        k for k in schema.slots.keys() if "__external__" in k
+                    )
+                    validation_schema = schema.slots[external_slot]
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot __external__."
+                    ) from None
+
+                external_curator = ComponentCurator(
+                    pd.DataFrame([features]), validation_schema
+                )
+                external_curator.validate()
+                artifact._external_features = features
+
+            # Validate main DataFrame if not Composite or if Composite has attrs
+            if schema.itype != "Composite" or "attrs" in schema.slots:
+                curator = ComponentCurator(artifact, schema)
+                curator.validate()
+                artifact.schema = schema
+                artifact._curator = curator
+
         return artifact
 
+    @classmethod
+    @deprecated("from_dataframe")
+    def from_df(
+        cls,
+        df: pd.DataFrame,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        revises: Artifact | None = None,
+        schema: Schema | None = None,
+        **kwargs,
+    ) -> Artifact:
+        return cls.from_dataframe(
+            df,
+            key=key,
+            description=description,
+            run=run,
+            revises=revises,
+            schema=schema,
+            **kwargs,
+        )
+
     @classmethod
     def from_anndata(
         cls,
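Migration note: `from_df()` stays as a deprecated alias that forwards to `from_dataframe()`. Usage per the updated docstring:

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()
    artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
    # still works, but emits a deprecation warning:
    # artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()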
@@ -2580,94 +2846,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
             artifact.delete()  # delete all versions, the data will be deleted or prompted for deletion.
         """
-
-        # storage = True if storage is None else storage
-
-        # this first check means an invalid delete fails fast rather than cascading through
-        # database and storage permission errors
-        if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
-            isettings = setup_settings.instance
-            if self.storage.instance_uid != isettings.uid and (
-                storage or storage is None
-            ):
-                raise IntegrityError(
-                    "Cannot simply delete artifacts outside of this instance's managed storage locations."
-                    "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
-                    f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
-                    f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
-                )
-        # by default, we only move artifacts into the trash (branch_id = -1)
-        trash_branch_id = -1
-        if self.branch_id > trash_branch_id and not permanent:
-            if storage is not None:
-                logger.warning("moving artifact to trash, storage arg is ignored")
-            # move to trash
-            self.branch_id = trash_branch_id
-            self.save()
-            logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
-            return
-
-        # if the artifact is already in the trash
-        # permanent delete skips the trash
-        if permanent is None:
-            # ask for confirmation of permanent delete
-            response = input(
-                "Artifact record is already in trash! Are you sure you want to permanently"
-                " delete it? (y/n) You can't undo this action."
-            )
-            delete_record = response == "y"
-        else:
-            assert permanent  # noqa: S101
-            delete_record = True
-
-        if delete_record:
-            # need to grab file path before deletion
-            try:
-                path, _ = filepath_from_artifact(self, using_key)
-            except OSError:
-                # we can still delete the record
-                logger.warning("Could not get path")
-                storage = False
-            # only delete in storage if DB delete is successful
-            # DB delete might error because of a foreign key constraint violated etc.
-            if self._overwrite_versions and self.is_latest:
-                logger.important(
-                    "deleting all versions of this artifact because they all share the same store"
-                )
-                for version in self.versions.all():  # includes self
-                    _delete_skip_storage(version)
-            else:
-                self._delete_skip_storage()
-            # by default do not delete storage if deleting only a previous version
-            # and the underlying store is mutable
-            if self._overwrite_versions and not self.is_latest:
-                delete_in_storage = False
-                if storage:
-                    logger.warning(
-                        "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
-                    )
-            elif self.key is None or self._key_is_virtual:
-                # do not ask for confirmation also if storage is None
-                delete_in_storage = storage is None or storage
-            else:
-                # for artifacts with non-virtual semantic storage keys (key is not None)
-                # ask for extra-confirmation
-                if storage is None:
-                    response = input(
-                        f"Are you sure to want to delete {path}? (y/n) You can't undo"
-                        " this action."
-                    )
-                    delete_in_storage = response == "y"
-                else:
-                    delete_in_storage = storage
-            if not delete_in_storage:
-                logger.important(f"a file/folder remains here: {path}")
-            # we don't yet have logic to bring back the deleted metadata record
-            # in case storage deletion fails - this is important for ACID down the road
-            if delete_in_storage:
-                delete_msg = delete_storage(path, raise_file_not_found_error=False)
-                if delete_msg != "did-not-delete":
-                    logger.success(f"deleted {colors.yellow(f'{path}')}")
+        super().delete(permanent=permanent, storage=storage, using_key=using_key)
 
     @property
     def _is_saved_to_storage_location(self) -> bool | None:
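User-visible behavior is unchanged by this refactor; the logic moved into the base-class `delete()`. Per the removed code, the flow is (a summary sketch):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/dataset1.parquet")
    artifact.delete()                # moves the record to trash (branch_id = -1)
    artifact.delete(permanent=True)  # permanently deletes the record and, per `storage=`, the file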
@@ -2796,11 +2975,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 local_path_cache,
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+        # Handle external features
+        if hasattr(self, "_external_features") and self._external_features is not None:
+            external_features = self._external_features
+            delattr(self, "_external_features")
+            self.features.add_values(external_features)
+
+        # annotate Artifact
         if hasattr(self, "_curator"):
            curator = self._curator
            delattr(self, "_curator")
            # just annotates this artifact
            curator.save_artifact()
+
         return self
 
     def restore(self) -> None:
@@ -2848,7 +3036,7 @@ def _synchronize_cleanup_on_error(
 
 
 def _delete_skip_storage(artifact, *args, **kwargs) -> None:
-    super(
+    super(SQLRecord, artifact).delete(*args, **kwargs)
 
 
 def _save_skip_storage(artifact, **kwargs) -> None:
|
@@ -2866,6 +3054,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
|
|
2866
3054
|
featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
|
2867
3055
|
|
2868
3056
|
class Meta:
|
3057
|
+
app_label = "lamindb"
|
2869
3058
|
unique_together = ("artifact", "featurevalue")
|
2870
3059
|
|
2871
3060
|
|