lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +17 -15
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +2 -2
- lamindb/core/storage/_anndata_accessor.py +29 -9
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +442 -188
- lamindb/errors.py +6 -0
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +32 -6
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +9 -2
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/fixtures/sheets.py +8 -2
- lamindb/integrations/_croissant.py +34 -11
- lamindb/migrations/0119_squashed.py +5 -2
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +60 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +131 -71
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +326 -172
- lamindb/models/artifact_set.py +45 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +18 -0
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +191 -78
- lamindb/models/record.py +30 -5
- lamindb/models/run.py +10 -33
- lamindb/models/save.py +6 -8
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +152 -40
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
- {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/models/artifact.py
CHANGED
@@ -1,7 +1,6 @@
 # ruff: noqa: TC004
 from __future__ import annotations
 
-import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
@@ -9,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
-import numpy as np
 import pandas as pd
 from anndata import AnnData
 from django.db import connections, models
@@ -63,14 +61,13 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import
+from ..errors import InvalidArgument, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
 from ._django import get_artifact_with_related, get_collection_with_related
 from ._feature_manager import (
     FeatureManager,
-    filter_base,
     get_label_links,
 )
 from ._is_versioned import IsVersioned
@@ -201,7 +198,7 @@ def process_pathlike(
         # hence, we revert the creation and throw an error
         storage_record.delete()
         raise UnknownStorageLocation(
-            f"Path {filepath} is not contained in any known storage location:\n{Storage.
+            f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n"
            f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
        )
        use_existing_storage_key = True
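The new error message calls `Storage.to_dataframe()` where 1.10.2 used a shorter accessor, which suggests the registry-wide rename of `.df()` to `.to_dataframe()` in this release. A minimal sketch of listing storage locations under that assumption (requires a loaded lamindb instance):

```python
import lamindb as ln

# assumes the 1.11.0 rename to .to_dataframe(); column selection as in the error message
ln.Storage.to_dataframe()[["uid", "root", "type"]]
```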
@@ -419,24 +416,6 @@ def get_artifact_kwargs_from_data(
         skip_check_exists,
         is_replace=is_replace,
     )
-    stat_or_artifact = get_stat_or_artifact(
-        path=path,
-        key=key,
-        instance=using_key,
-        is_replace=is_replace,
-    )
-    if isinstance(stat_or_artifact, Artifact):
-        existing_artifact = stat_or_artifact
-        if run is not None:
-            existing_artifact._populate_subsequent_runs(run)
-        return existing_artifact, None
-    else:
-        size, hash, hash_type, n_files, revises = stat_or_artifact
-
-    if revises is not None:  # update provisional_uid
-        provisional_uid, revises = create_uid(revises=revises, version=version)
-        if settings.cache_dir in path.parents:
-            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
 
     check_path_in_storage = False
     if use_existing_storage_key:
@@ -457,6 +436,25 @@ def get_artifact_kwargs_from_data(
     else:
         storage = storage
 
+    stat_or_artifact = get_stat_or_artifact(
+        path=path,
+        key=key,
+        instance=using_key,
+        is_replace=is_replace,
+    )
+    if isinstance(stat_or_artifact, Artifact):
+        existing_artifact = stat_or_artifact
+        if run is not None:
+            existing_artifact._populate_subsequent_runs(run)
+        return existing_artifact, None
+    else:
+        size, hash, hash_type, n_files, revises = stat_or_artifact
+
+    if revises is not None:  # update provisional_uid
+        provisional_uid, revises = create_uid(revises=revises, version=version)
+        if settings.cache_dir in path.parents:
+            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
    log_storage_hint(
        check_path_in_storage=check_path_in_storage,
        storage=storage,
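Moving the `get_stat_or_artifact()` call below storage resolution does not change the user-facing deduplication contract visible in the hunk: if the content hash is already registered, the existing record is returned instead of a new one. A sketch of that behavior, assuming a local file and a loaded instance:

```python
import lamindb as ln

# both calls hash ./my_file.parquet; the second lookup finds the existing record
a1 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
a2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet")
assert a2.uid == a1.uid  # no duplicate artifact is created
```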
@@ -552,12 +550,19 @@ def data_is_scversedatastructure(
         file_suffix = ".h5mu"
     # SpatialData does not have a unique suffix but `.zarr`
 
+    # AnnData allows both AnnDataAccessor and AnnData
+    class_name = data.__class__.__name__
     if structure_type is None:
         return any(
-
+            class_name
+            in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
             for cl_name in ["AnnData", "MuData", "SpatialData"]
         )
-    elif
+    elif class_name in (
+        ["AnnData", "AnnDataAccessor"]
+        if structure_type == "AnnData"
+        else [structure_type]
+    ):
        return True
 
    data_type = structure_type.lower()
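The membership logic above is self-contained enough to check in isolation. A pure-Python sketch (function name hypothetical) showing that `AnnDataAccessor` now counts as `AnnData`:

```python
def matches_scverse_structure(class_name: str, structure_type: str | None) -> bool:
    # mirrors the hunk above: "AnnData" also accepts the streaming AnnDataAccessor
    if structure_type is None:
        return any(
            class_name
            in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name])
            for cl_name in ["AnnData", "MuData", "SpatialData"]
        )
    return class_name in (
        ["AnnData", "AnnDataAccessor"]
        if structure_type == "AnnData"
        else [structure_type]
    )

assert matches_scverse_structure("AnnDataAccessor", "AnnData")
assert matches_scverse_structure("AnnDataAccessor", None)
assert not matches_scverse_structure("AnnDataAccessor", "MuData")
```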
@@ -586,6 +591,7 @@ def data_is_scversedatastructure(
                 f"we do not check whether cloud zarr is {structure_type}"
             )
             return False
+
     return False
 
 
@@ -605,7 +611,7 @@ def _check_otype_artifact(
 ) -> str:
     if otype is None:
         if isinstance(data, pd.DataFrame):
-            logger.warning("data is a DataFrame, please use .
+            logger.warning("data is a DataFrame, please use .from_dataframe()")
             otype = "DataFrame"
             return otype
 
@@ -873,7 +879,7 @@ def get_labels(
 
         values = []
         for v in qs_by_registry.values():
-            values += v.
+            values += v.to_list(get_name_field(v))
         return values
     if len(registries_to_check) == 1 and registry in qs_by_registry:
         return qs_by_registry[registry]
@@ -896,7 +902,7 @@ def add_labels(
     raise ValueError("Please save the artifact/collection before adding a label!")
 
    if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
-        records = records.
+        records = records.to_list()
    if isinstance(records, (str, SQLRecord)):
        records = [records]
    if not isinstance(records, list):  # avoids warning for pd Series
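Both call sites switch from a truncated legacy accessor to `QuerySet.to_list()`, which here takes an optional field name. A usage sketch under that assumption:

```python
import lamindb as ln

labels = ln.ULabel.filter()
records = labels.to_list()      # a list of records
names = labels.to_list("name")  # a list of values for a single field
```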
@@ -995,6 +1001,112 @@ def add_labels(
     )
 
 
+def delete_permanently(artifact: Artifact, storage: bool, using_key: str):
+    # need to grab file path before deletion
+    try:
+        path, _ = filepath_from_artifact(artifact, using_key)
+    except OSError:
+        # we can still delete the record
+        logger.warning("Could not get path")
+        storage = False
+    # only delete in storage if DB delete is successful
+    # DB delete might error because of a foreign key constraint violated etc.
+    if artifact._overwrite_versions and artifact.is_latest:
+        logger.important(
+            "deleting all versions of this artifact because they all share the same store"
+        )
+        for version in artifact.versions.all():  # includes artifact
+            _delete_skip_storage(version)
+    else:
+        artifact._delete_skip_storage()
+    # by default do not delete storage if deleting only a previous version
+    # and the underlying store is mutable
+    if artifact._overwrite_versions and not artifact.is_latest:
+        delete_in_storage = False
+        if storage:
+            logger.warning(
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
+            )
+    elif artifact.key is None or artifact._key_is_virtual:
+        # do not ask for confirmation also if storage is None
+        delete_in_storage = storage is None or storage
+    else:
+        # for artifacts with non-virtual semantic storage keys (key is not None)
+        # ask for extra-confirmation if storage is None
+        if storage is None:
+            response = input(
+                f"Are you sure to want to delete {path}? (y/n) You can't undo"
+                " this action."
+            )
+            delete_in_storage = response == "y"
+        else:
+            delete_in_storage = storage
+    if not delete_in_storage:
+        logger.important(f"a file/folder remains here: {path}")
+    # we don't yet have logic to bring back the deleted metadata record
+    # in case storage deletion fails - this is important for ACID down the road
+    if delete_in_storage:
+        delete_msg = delete_storage(path, raise_file_not_found_error=False)
+        if delete_msg != "did-not-delete":
+            logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
+class LazyArtifact:
+    """Lazy artifact for streaming to auto-generated internal paths.
+
+    This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+    and register the path as an artifact (see :class:`~lamindb.Artifact`).
+
+    This object creates a real artifact on `.save()` with the provided arguments.
+
+    Args:
+        suffix: The suffix for the auto-generated internal path
+        overwrite_versions: Whether to overwrite versions.
+        **kwargs: Keyword arguments for the artifact to be created.
+
+    Examples:
+
+        Create a lazy artifact, write to the path and save to get a real artifact::
+
+            lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+            zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+            artifact = lazy.save()
+    """
+
+    def __init__(self, suffix: str, overwrite_versions: bool, **kwargs):
+        self.kwargs = kwargs
+        self.kwargs["overwrite_versions"] = overwrite_versions
+
+        if (key := kwargs.get("key")) is not None and extract_suffix_from_path(
+            PurePosixPath(key)
+        ) != suffix:
+            raise ValueError(
+                "The suffix argument and the suffix of key should be the same."
+            )
+
+        uid, _ = create_uid(n_full_id=20)
+        storage_key = auto_storage_key_from_artifact_uid(
+            uid, suffix, overwrite_versions=overwrite_versions
+        )
+        storepath = setup_settings.storage.root / storage_key
+
+        self._path = storepath
+
+    @property
+    def path(self) -> UPath:
+        return self._path
+
+    def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+        artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs)
+        return artifact.save(upload=upload, **kwargs)
+
+    def __repr__(self) -> str:  # pragma: no cover
+        show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None}
+        return (
+            f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}"
+        )
+
+
 class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     # Note that this docstring has to be consistent with Curator.save_artifact()
     """Datasets & models stored as files, folders, or arrays.
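The docstring's example, spelled out as a runnable sketch (requires `zarr` and a loaded lamindb instance): `from_lazy()` reserves an auto-generated internal path, you stream to it, and `save()` turns it into a regular `Artifact`.

```python
import lamindb as ln
import numpy as np
import zarr

lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
artifact = lazy.save()  # instantiates the real Artifact from the streamed store
```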
@@ -1030,15 +1142,22 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
 
-    If you want to **validate & annotate** an array, pass a `schema` to one of the `.
+    If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors::
 
         schema = ln.Schema(itype=ln.Feature)  # a schema that merely enforces that feature names exist in the Feature registry
-        artifact = ln.Artifact.
+        artifact = ln.Artifact.from_dataframe("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
+
+    To annotate by **external features**::
+
+        schema = ln.examples.schemas.valid_features()
+        artifact = ln.Artifact("./my_file.parquet", features={"species": "bird"}).save()
+
+    A `schema` can be optionally passed to also validate the features.
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
         artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
-        artifact_v2.versions.
+        artifact_v2.versions.to_dataframe()  # see all versions
 
     You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
@@ -1112,6 +1231,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
         abstract = False
+        app_label = "lamindb"
         constraints = [
             # a simple hard unique constraint on `hash` clashes with the fact
             # that pipelines sometimes aim to ingest the exact same file in different
@@ -1159,11 +1279,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             ln.Artifact.filter(scientist="Barbara McClintock")
 
-        Features may or may not be part of the dataset, i.e., the artifact content in storage.
-        instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
-        `DataFrame`-like artifact and annotates it with features corresponding to
-
-        validate the content of the artifact.
+        Features may or may not be part of the dataset, i.e., the artifact content in storage.
+        For instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+        `DataFrame`-like artifact and annotates it with features corresponding to these columns.
+        `artifact.features.add_values`, by contrast, does not validate the content of the artifact.
 
         .. dropdown:: An example for a model-like artifact
 
@@ -1178,6 +1297,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     "subset_highlyvariable": True,
                 },
             })
+
+        To validate external features::
+
+            schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
+            artifact.features.add_values({"species": "bird"}, schema=schema)
         """
         from ._feature_manager import FeatureManager
 
@@ -1387,15 +1511,46 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # now proceed with the user-facing constructor
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: data")
+
         data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
         kind: str = kwargs.pop("kind", None)
         key: str | None = kwargs.pop("key", None)
         run_id: int | None = kwargs.pop("run_id", None)  # for REST API
         run: Run | None = kwargs.pop("run", None)
+        using_key = kwargs.pop("using_key", None)
         description: str | None = kwargs.pop("description", None)
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
+
+        features: dict[str, Any] = kwargs.pop("features", None)
+        schema: Schema | None = kwargs.pop("schema", None)
+        if features is not None and schema is not None:
+            from lamindb.curators import DataFrameCurator
+
+            temp_df = pd.DataFrame([features])
+            validation_schema = schema
+            if schema.itype == "Composite" and schema.slots:
+                if len(schema.slots) > 1:
+                    raise ValueError(
+                        f"Composite schema has {len(schema.slots)} slots. "
+                        "External feature validation only supports schemas with a single slot."
+                    )
+                try:
+                    validation_schema = next(
+                        k for k in schema.slots.keys() if k.startswith("__external")
+                    )
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot that starts with __external."
+                    ) from None
+
+            external_curator = DataFrameCurator(temp_df, validation_schema)
+            external_curator.validate()
+            external_curator._artifact = self
+
+            self._external_features = features
+
        branch_id: int | None = None
        if "visibility" in kwargs:  # backward compat
            branch_id = kwargs.pop("visibility")
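Per the new constructor logic, `features` are validated eagerly (a one-row DataFrame curated against the schema, or against its single `__external*` slot for Composite schemas) and stashed on `self._external_features` for `save()` to attach. A sketch combining the docstring examples above:

```python
import lamindb as ln

# schema construction as in the class docstring above
schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
artifact = ln.Artifact(
    "./my_file.parquet",
    key="examples/my_file.parquet",
    features={"species": "bird"},  # validated against schema before saving
    schema=schema,
).save()
```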
@@ -1406,13 +1561,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         else:
             branch_id = 1
         branch = kwargs.pop("branch", None)
+
         space = kwargs.pop("space", None)
-        space_id
+        assert "space_id" not in kwargs, "please pass space instead"  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
+        storage_was_passed = False
         if "storage" in kwargs:
             storage = kwargs.pop("storage")
+            storage_was_passed = True
         elif (
             setup_settings.instance.keep_artifacts_local
             and setup_settings.instance._local_storage is not None
@@ -1420,7 +1578,24 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             storage = setup_settings.instance.local_storage.record
         else:
             storage = setup_settings.instance.storage.record
-
+        if space is None:
+            from lamindb import context as run_context
+
+            if run_context.space is not None:
+                space = run_context.space
+            elif setup_settings.space is not None:
+                space = setup_settings.space
+        if space is not None and space != storage.space:
+            if storage_was_passed:
+                logger.warning(
+                    "storage argument ignored as storage information from space takes precedence"
+                )
+            storage_locs_for_space = Storage.filter(space=space)
+            storage = storage_locs_for_space.first()
+            if len(storage_locs_for_space) > 1:
+                logger.warning(
+                    f"more than one storage location for space {space}, choosing {storage}"
+                )
        otype = kwargs.pop("otype") if "otype" in kwargs else None
        if isinstance(data, str) and data.startswith("s3:///"):
            # issue in Groovy / nf-lamin producing malformed S3 paths
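The resolution order introduced here: an explicit `space` argument wins, then the tracked run's space, then the instance settings' space; when the resolved space differs from the default storage's space, the first storage location registered for that space is used (with warnings when several exist or a `storage` argument gets overridden). A pure-Python sketch of the precedence (names hypothetical):

```python
def resolve_space(explicit, run_space, settings_space):
    # first non-None candidate wins, mirroring the constructor logic above
    for candidate in (explicit, run_space, settings_space):
        if candidate is not None:
            return candidate
    return None

assert resolve_space(None, "review", "prod") == "review"
assert resolve_space("scratch", "review", "prod") == "scratch"
assert resolve_space(None, None, None) is None
```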
@@ -1461,6 +1636,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
         else:
             is_automanaged_path = False
+
         provisional_uid, revises = create_uid(revises=revises, version=version)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
@@ -1518,7 +1694,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         uid, revises = create_uid(revises=revises, version=version)
         kwargs["uid"] = uid
 
-        # only set key now so that we don't
+        # only set key now so that we don't perform a look-up on it in case revises is passed
         if revises is not None and revises.key is not None and kwargs["key"] is None:
             kwargs["key"] = revises.key
 
@@ -1530,7 +1706,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["branch"] = branch
         kwargs["branch_id"] = branch_id
         kwargs["space"] = space
-        kwargs["space_id"] = space_id
         kwargs["otype"] = otype
         kwargs["revises"] = revises
         # this check needs to come down here because key might be populated from an
@@ -1544,6 +1719,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1627,6 +1839,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             idlike: Either a uid stub, uid or an integer id.
             is_run_input: Whether to track this artifact as run input.
             expressions: Fields and values passed as Django query expressions.
+                Use `path=...` to get an artifact for a local or remote filepath if it exists.
 
         Raises:
             :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
@@ -1641,6 +1854,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
             artifact = ln.Artifact.get(key="examples/my_file.parquet")
+            artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
        """
        from .query_set import QuerySet
 
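`get()` now also resolves a filepath. A usage sketch; the `DoesNotExist` import matches the exception named in the docstring's Raises section:

```python
import lamindb as ln
from lamindb.errors import DoesNotExist

try:
    artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
except DoesNotExist:
    artifact = None  # the path is not registered in this instance
```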
@@ -1672,45 +1886,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             ln.Artifact.filter(cell_type_by_model__name="T cell")
 
         """
-        from
-
-        if expressions:
-            keys_normalized = [key.split("__")[0] for key in expressions]
-            field_or_feature_or_param = keys_normalized[0].split("__")[0]
-            if field_or_feature_or_param in Artifact.__get_available_fields__():
-                qs = QuerySet(model=cls).filter(*queries, **expressions)
-                if not any(e.startswith("kind") for e in expressions):
-                    return qs.exclude(kind="__lamindb_run__")
-                else:
-                    return qs
-            elif all(
-                features_validated := Feature.validate(
-                    keys_normalized, field="name", mute=True
-                )
-            ):
-                return filter_base(Artifact, **expressions)
-            else:
-                features = ", ".join(
-                    sorted(np.array(keys_normalized)[~features_validated])
-                )
-                message = f"feature names: {features}"
-                avail_fields = cls.__get_available_fields__()
-                if "_branch_code" in avail_fields:
-                    avail_fields.remove("_branch_code")  # backward compat
-                fields = ", ".join(sorted(avail_fields))
-                raise InvalidArgument(
-                    f"You can query either by available fields: {fields}\n"
-                    f"Or fix invalid {message}"
-                )
-        else:
-            return (
-                QuerySet(model=cls)
-                .filter(*queries, **expressions)
-                .exclude(kind="__lamindb_run__")
-            )
+        # from Registry metaclass
+        return type(cls).filter(cls, *queries, **expressions)
 
     @classmethod
-    def
+    def from_dataframe(
        cls,
        df: pd.DataFrame,
        *,
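After this change, the field-vs-feature dispatch (and the `kind="__lamindb_run__"` exclusion) lives once in the Registry metaclass instead of being duplicated here, so both query styles from the docstring keep working:

```python
import lamindb as ln

ln.Artifact.filter(suffix=".parquet")               # a registry field
ln.Artifact.filter(scientist="Barbara McClintock")  # a validated Feature name
```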
@@ -1719,6 +1899,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         run: Run | None = None,
         revises: Artifact | None = None,
         schema: Schema | None = None,
+        features: dict[str, Any] | None = None,
         **kwargs,
     ) -> Artifact:
         """Create from `DataFrame`, optionally validate & annotate.
@@ -1731,6 +1912,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
+            features: External features dict for additional annotation.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1745,7 +1927,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             import lamindb as ln
 
             df = ln.core.datasets.mini_immuno.get_dataset1()
-            artifact = ln.Artifact.
+            artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
         With validation and annotation.
 
@@ -1762,6 +1944,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         .. literalinclude:: scripts/define_mini_immuno_features_labels.py
            :language: python
 
+        External features:
+
+        .. literalinclude:: scripts/curate_dataframe_external_features.py
+           :language: python
         """
         artifact = Artifact(  # type: ignore
             data=df,
@@ -1774,8 +1960,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
+
         if schema is not None:
-            from
+            from lamindb.curators.core import ComponentCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1784,12 +1971,56 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-
-
-
-
+            # Handle external features validation for Composite schemas
+            if schema.itype == "Composite" and features is not None:
+                try:
+                    external_slot = next(
+                        k for k in schema.slots.keys() if "__external__" in k
+                    )
+                    validation_schema = schema.slots[external_slot]
+                except StopIteration:
+                    raise ValueError(
+                        "External feature validation requires a slot __external__."
+                    ) from None
+
+                external_curator = ComponentCurator(
+                    pd.DataFrame([features]), validation_schema
+                )
+                external_curator.validate()
+                artifact._external_features = features
+
+            # Validate main DataFrame if not Composite or if Composite has attrs
+            if schema.itype != "Composite" or "attrs" in schema.slots:
+                curator = ComponentCurator(artifact, schema)
+                curator.validate()
+                artifact.schema = schema
+                artifact._curator = curator
+
         return artifact
 
+    @classmethod
+    @deprecated("from_dataframe")
+    def from_df(
+        cls,
+        df: pd.DataFrame,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        revises: Artifact | None = None,
+        schema: Schema | None = None,
+        **kwargs,
+    ) -> Artifact:
+        return cls.from_dataframe(
+            df,
+            key=key,
+            description=description,
+            run=run,
+            revises=revises,
+            schema=schema,
+            **kwargs,
+        )
+
    @classmethod
    def from_anndata(
        cls,
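The slot selection above is easy to isolate. A pure-Python sketch (helper name hypothetical) of how the `__external__` slot is picked and the error raised when it is missing:

```python
def pick_external_slot(slots: dict) -> str:
    # mirrors the next(...) / StopIteration handling in from_dataframe() above
    try:
        return next(k for k in slots if "__external__" in k)
    except StopIteration:
        raise ValueError(
            "External feature validation requires a slot __external__."
        ) from None

assert pick_external_slot({"attrs": None, "__external__": None}) == "__external__"
```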
@@ -2580,94 +2811,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
             artifact.delete()  # delete all versions, the data will be deleted or prompted for deletion.
         """
-
-        # storage = True if storage is None else storage
-
-        # this first check means an invalid delete fails fast rather than cascading through
-        # database and storage permission errors
-        if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
-            isettings = setup_settings.instance
-            if self.storage.instance_uid != isettings.uid and (
-                storage or storage is None
-            ):
-                raise IntegrityError(
-                    "Cannot simply delete artifacts outside of this instance's managed storage locations."
-                    "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
-                    f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
-                    f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
-                )
-        # by default, we only move artifacts into the trash (branch_id = -1)
-        trash_branch_id = -1
-        if self.branch_id > trash_branch_id and not permanent:
-            if storage is not None:
-                logger.warning("moving artifact to trash, storage arg is ignored")
-            # move to trash
-            self.branch_id = trash_branch_id
-            self.save()
-            logger.important(f"moved artifact to trash (branch_id = {trash_branch_id})")
-            return
-
-        # if the artifact is already in the trash
-        # permanent delete skips the trash
-        if permanent is None:
-            # ask for confirmation of permanent delete
-            response = input(
-                "Artifact record is already in trash! Are you sure you want to permanently"
-                " delete it? (y/n) You can't undo this action."
-            )
-            delete_record = response == "y"
-        else:
-            assert permanent  # noqa: S101
-            delete_record = True
-
-        if delete_record:
-            # need to grab file path before deletion
-            try:
-                path, _ = filepath_from_artifact(self, using_key)
-            except OSError:
-                # we can still delete the record
-                logger.warning("Could not get path")
-                storage = False
-            # only delete in storage if DB delete is successful
-            # DB delete might error because of a foreign key constraint violated etc.
-            if self._overwrite_versions and self.is_latest:
-                logger.important(
-                    "deleting all versions of this artifact because they all share the same store"
-                )
-                for version in self.versions.all():  # includes self
-                    _delete_skip_storage(version)
-            else:
-                self._delete_skip_storage()
-            # by default do not delete storage if deleting only a previous version
-            # and the underlying store is mutable
-            if self._overwrite_versions and not self.is_latest:
-                delete_in_storage = False
-                if storage:
-                    logger.warning(
-                        "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
-                    )
-            elif self.key is None or self._key_is_virtual:
-                # do not ask for confirmation also if storage is None
-                delete_in_storage = storage is None or storage
-            else:
-                # for artifacts with non-virtual semantic storage keys (key is not None)
-                # ask for extra-confirmation
-                if storage is None:
-                    response = input(
-                        f"Are you sure to want to delete {path}? (y/n) You can't undo"
-                        " this action."
-                    )
-                    delete_in_storage = response == "y"
-                else:
-                    delete_in_storage = storage
-            if not delete_in_storage:
-                logger.important(f"a file/folder remains here: {path}")
-            # we don't yet have logic to bring back the deleted metadata record
-            # in case storage deletion fails - this is important for ACID down the road
-            if delete_in_storage:
-                delete_msg = delete_storage(path, raise_file_not_found_error=False)
-                if delete_msg != "did-not-delete":
-                    logger.success(f"deleted {colors.yellow(f'{path}')}")
+        super().delete(permanent=permanent, storage=storage, using_key=using_key)
 
    @property
    def _is_saved_to_storage_location(self) -> bool | None:
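The ~90 removed lines move into the shared `SQLRecord.delete()` path, presumably backed by the module-level `delete_permanently()` shown earlier, so the user-facing flow is unchanged. A sketch, assuming a saved artifact:

```python
import lamindb as ln

artifact = ln.Artifact.get(key="examples/my_file.parquet")
artifact.delete()                # moves the record to trash (branch_id = -1)
artifact.delete(permanent=True)  # deletes the record and (subject to `storage`) the data
```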
@@ -2796,11 +2940,20 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 local_path_cache,
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
+
+        # Handle external features
+        if hasattr(self, "_external_features") and self._external_features is not None:
+            external_features = self._external_features
+            delattr(self, "_external_features")
+            self.features.add_values(external_features)
+
+        # annotate Artifact
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
             # just annotates this artifact
             curator.save_artifact()
+
        return self
 
    def restore(self) -> None:
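`save()` drains `_external_features` into `features.add_values()` before the curator annotation runs. The post-hoc equivalent, per the class docstring above:

```python
import lamindb as ln

# manual annotation of an already-saved artifact, as in the class docstring
artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save()
artifact.features.add_values({"species": "bird"}, schema=schema)
```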
@@ -2848,7 +3001,7 @@ def _synchronize_cleanup_on_error(
 
 
 def _delete_skip_storage(artifact, *args, **kwargs) -> None:
-    super(
+    super(SQLRecord, artifact).delete(*args, **kwargs)
 
 
 def _save_skip_storage(artifact, **kwargs) -> None:
@@ -2866,6 +3019,7 @@ class ArtifactFeatureValue(BaseSQLRecord, IsLink, TracksRun):
     featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
+        app_label = "lamindb"
        unique_together = ("artifact", "featurevalue")
 