lamindb 1.12.1__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- lamindb/__init__.py +2 -2
- lamindb/_finish.py +1 -1
- lamindb/_tracked.py +3 -15
- lamindb/core/_context.py +45 -19
- lamindb/curators/_legacy.py +1 -1
- lamindb/curators/core.py +51 -21
- lamindb/errors.py +6 -0
- lamindb/examples/datasets/_core.py +1 -1
- lamindb/integrations/__init__.py +0 -18
- lamindb/integrations/{_lightning.py → lightning.py} +13 -10
- lamindb/migrations/0134_run_params.py +17 -0
- lamindb/migrations/{0133_squashed.py → 0134_squashed.py} +93 -90
- lamindb/models/_feature_manager.py +30 -20
- lamindb/models/_label_manager.py +3 -5
- lamindb/models/artifact.py +250 -291
- lamindb/models/artifact_set.py +4 -4
- lamindb/models/block.py +11 -9
- lamindb/models/can_curate.py +1 -1
- lamindb/models/collection.py +16 -17
- lamindb/models/has_parents.py +1 -3
- lamindb/models/query_manager.py +7 -7
- lamindb/models/query_set.py +38 -12
- lamindb/models/run.py +53 -49
- lamindb/models/schema.py +79 -65
- lamindb/models/sqlrecord.py +32 -17
- lamindb/models/transform.py +6 -3
- {lamindb-1.12.1.dist-info → lamindb-1.13.0.dist-info}/METADATA +26 -22
- {lamindb-1.12.1.dist-info → lamindb-1.13.0.dist-info}/RECORD +30 -29
- {lamindb-1.12.1.dist-info → lamindb-1.13.0.dist-info}/LICENSE +0 -0
- {lamindb-1.12.1.dist-info → lamindb-1.13.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact.py
CHANGED
@@ -62,7 +62,7 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import InvalidArgument, ValidationError
+from ..errors import InvalidArgument, NoStorageLocationForSpace, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
@@ -301,6 +301,7 @@ def get_stat_or_artifact(
     check_hash: bool = True,
     is_replace: bool = False,
     instance: str | None = None,
+    skip_hash_lookup: bool = False,
 ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:
     """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
     n_files = None
@@ -328,31 +329,39 @@ def get_stat_or_artifact(
     if not check_hash:
         return size, hash, hash_type, n_files, None
     previous_artifact_version = None
-    if …
-        …
+    if skip_hash_lookup:
+        artifact_with_same_hash_exists = False
+        hash_lookup_result = []
     else:
-        …
-            Artifact.objects.using(instance)
-        …
+        if key is None or is_replace:
+            hash_lookup_result = Artifact.objects.using(instance).filter(
+                ~Q(branch_id=-1), hash=hash
+            )
+            artifact_with_same_hash_exists = len(hash_lookup_result) > 0
+        else:
+            hash_lookup_result = (
+                Artifact.objects.using(instance)
+                .filter(
+                    ~Q(branch_id=-1),
+                    Q(hash=hash) | Q(key=key, storage=storage),
+                )
+                .order_by("-created_at")
+            )
+            artifact_with_same_hash_exists = (
+                hash_lookup_result.filter(hash=hash).count() > 0
+            )
+        if key is not None and not is_replace:
+            if not artifact_with_same_hash_exists and len(hash_lookup_result) > 0:
                 logger.important(
                     f"creating new artifact version for key='{key}' (storage: '{storage.root}')"
                 )
-        previous_artifact_version = …
+                previous_artifact_version = hash_lookup_result[0]
     if artifact_with_same_hash_exists:
         message = "returning existing artifact with same hash"
-        if result[0].branch_id == -1:
-            result[0].restore()
-            message = "restored artifact with same hash from trash"
         logger.important(
-            f"{message}: {…
+            f"{message}: {hash_lookup_result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
         )
-        return …
+        return hash_lookup_result[0]
     else:
         return size, hash, hash_type, n_files, previous_artifact_version
 
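In practice, this hash lookup is what deduplicates repeated saves: creating an artifact whose content hashes to an existing record returns that record instead of a new one. A minimal sketch of the behavior (file name illustrative, assuming a local `data.csv`)::

    import lamindb as ln

    a1 = ln.Artifact("data.csv", key="examples/data.csv").save()
    # identical content, identical hash: logs
    # "returning existing artifact with same hash"
    a2 = ln.Artifact("data.csv", key="examples/data.csv").save()
    assert a1.id == a2.id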
@@ -407,8 +416,8 @@ def get_artifact_kwargs_from_data(
     is_replace: bool = False,
     skip_check_exists: bool = False,
     overwrite_versions: bool | None = None,
+    skip_hash_lookup: bool = False,
 ):
-    run = get_run(run)
     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
         provisional_uid,
         data,
@@ -440,11 +449,10 @@ def get_artifact_kwargs_from_data(
         key=key,
         instance=using_key,
         is_replace=is_replace,
+        skip_hash_lookup=skip_hash_lookup,
     )
     if isinstance(stat_or_artifact, Artifact):
         existing_artifact = stat_or_artifact
-        if run is not None:
-            existing_artifact._populate_subsequent_runs(run)
         return existing_artifact, None
     else:
        size, hash, hash_type, n_files, revises = stat_or_artifact
@@ -634,11 +642,12 @@ def _check_otype_artifact(
     return otype
 
 
-def …
+def populate_subsequent_run(record: Union[Artifact, Collection], run: Run):
     if record.run is None:
         record.run = run
     elif record.run != run:
         record._subsequent_runs.add(run)
+        record._subsequent_run_id = run.id
 
 
 # also see current_run() in core._data
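The renamed `populate_subsequent_run` helper keeps the original creating run on the record, appends the new run to `_subsequent_runs`, and now also caches `_subsequent_run_id` in memory; `track_run_input` (see the last hunk below) uses that id to avoid tracking a record as an input of the very run that just re-created it. A hedged sketch of the observable effect, assuming a tracked script::

    import lamindb as ln

    ln.track()  # registers the current run
    # if an identical artifact already exists, its original .run is kept
    # and the current run is linked via ._subsequent_runs
    artifact = ln.Artifact("data.csv", key="examples/data.csv").save()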
@@ -698,29 +707,6 @@ def save_schema_links(self: Artifact) -> None:
     bulk_create(links, ignore_conflicts=True)
 
 
-# can restore later if needed
-# def format_provenance(self, fk_data, print_types):
-#     type_str = lambda attr: (
-#         f": {get_related_model(self.__class__, attr).__name__}" if print_types else ""
-#     )
-
-#     return "".join(
-#         [
-#             f"  .{field_name}{type_str(field_name)} = {format_field_value(value.get('name'))}\n"
-#             for field_name, value in fk_data.items()
-#             if value.get("name")
-#         ]
-#     )
-
-# can restore later if needed
-# def format_input_of_runs(self, print_types):
-#     if self.id is not None and self.input_of_runs.exists():
-#         values = [format_field_value(i.started_at) for i in self.input_of_runs.all()]
-#         type_str = ": Run" if print_types else ""  # type: ignore
-#         return f"  .input_of_runs{type_str} = {', '.join(values)}\n"
-#     return ""
-
-
 def _describe_postgres(self):  # for Artifact & Collection
     from ._describe import (
         describe_artifact_general,
@@ -963,7 +949,7 @@ def add_labels(
         else:
             validate_feature(feature, records)  # type:ignore
             records_by_registry = defaultdict(list)
-            feature_sets = self.feature_sets.filter(itype="Feature")
+            feature_sets = self.feature_sets.filter(itype="Feature")
             internal_features = set()  # type: ignore
             if len(feature_sets) > 0:
                 for schema in feature_sets:
@@ -1136,6 +1122,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         space: `Space | None = None` The space of the artifact. If `None`, uses the current space.
         storage: `Storage | None = None` The storage location for the artifact. If `None`, uses the default storage location.
             You can see and set the default storage location in :attr:`~lamindb.core.Settings.storage`.
+        schema: A schema that defines how to validate & annotate.
+        features: Additional external features to link.
+        skip_hash_lookup: Skip the hash lookup so that a new artifact is created even if an identical artifact already exists.
 
     Examples:
 
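The new `skip_hash_lookup` flag turns the deduplication described above off. A sketch (file name illustrative)::

    import lamindb as ln

    # creates a brand-new artifact even if one with identical content exists
    artifact = ln.Artifact(
        "data.csv", key="examples/data-copy.csv", skip_hash_lookup=True
    ).save()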
@@ -1326,7 +1315,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     Similarly, you query based on these accessors::
 
-        ln.Artifact.filter(ulabels__name="Experiment 1")
+        ln.Artifact.filter(ulabels__name="Experiment 1")
 
     Unlike the registry-specific accessors, the `.labels` accessor provides
     a way of associating labels with features::
@@ -1495,12 +1484,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     def __init__(
         self,
         # we're not choosing the name "path" for this arg because
-        # it …
-        # …
+        # it could be confused with `artifact.path`
+        # "data" conveys better that this is input data that's ingested
         # and will be moved to a target path at `artifact.path`
-        # also internally, we sometimes pass "data objects" like a DataFrame
-        # here; and we might refactor this but we might also keep that internal
-        # usage
         data: UPathStr,
         kind: ArtifactKind | str | None = None,
         key: str | None = None,
@@ -1511,6 +1497,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         storage: Storage | None = None,
         branch: Branch | None = None,
         space: Space | None = None,
+        schema: Schema | None = None,
+        features: dict[str, Any] | None = None,
+        skip_hash_lookup: bool = False,
     ): ...
 
     @overload
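With these overload additions, external features can be passed, and optionally schema-validated, at construction time; they are linked on `.save()` via `.features.add_values()`. A minimal sketch, assuming a `Feature` named `study` exists (names illustrative)::

    import lamindb as ln

    ln.Feature(name="study", dtype=str).save()
    artifact = ln.Artifact(
        "data.csv",
        key="examples/data.csv",
        features={"study": "study-1"},
    ).save()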
@@ -1542,48 +1531,23 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
-
-        features: dict[str, Any] = kwargs.pop("features", None)
         schema: Schema | None = kwargs.pop("schema", None)
-
-        …
-                    f"Composite schema has {len(schema.slots)} slots. "
-                    "External feature validation only supports schemas with a single slot."
-                )
-            try:
-                validation_schema = next(
-                    k for k in schema.slots.keys() if k.startswith("__external")
-                )
-            except StopIteration:
-                raise ValueError(
-                    "External feature validation requires a slot that starts with __external."
-                ) from None
-
-            external_curator = DataFrameCurator(temp_df, validation_schema)
-            external_curator.validate()
-            external_curator._artifact = self
-
-            self._external_features = features
-
-        branch_id: int | None = None
-        if "visibility" in kwargs:  # backward compat
-            branch_id = kwargs.pop("visibility")
-        if "_branch_code" in kwargs:  # backward compat
-            branch_id = kwargs.pop("_branch_code")
-        elif "branch_id" in kwargs:
-            branch_id = kwargs.pop("branch_id")
-        else:
-            branch_id = 1
-        branch = kwargs.pop("branch", None)
+        features: dict[str, Any] | None = kwargs.pop("features", None)
+        skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False)
+
+        # validate external features if passed with a schema
+        if features is not None:
+            self._external_features = features
+            if schema is not None:
+                from lamindb.curators.core import ExperimentalDictCurator
 
+                validation_schema = schema
+                ExperimentalDictCurator(features, validation_schema).validate()
+
+        branch = kwargs.pop("branch", None)
+        assert "branch_id" not in kwargs, "Please pass branch instead of branch_id."  # noqa: S101
         space = kwargs.pop("space", None)
-        assert "space_id" not in kwargs, "…
+        assert "space_id" not in kwargs, "Please pass space instead of space_id."  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
@@ -1611,11 +1575,19 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     "storage argument ignored as storage information from space takes precedence"
                 )
             storage_locs_for_space = Storage.filter(space=space)
-
-            if …
-            …
+            n_storage_locs_for_space = len(storage_locs_for_space)
+            if n_storage_locs_for_space == 0:
+                raise NoStorageLocationForSpace(
+                    "No storage location found for space.\n"
+                    "Either create one via ln.Storage(root='create-s3', space=space).save()\n"
+                    "Or start managing access to an existing storage location via the space: storage_loc.space = space; storage.save()"
                 )
+            else:
+                storage = storage_locs_for_space.first()
+                if n_storage_locs_for_space > 1:
+                    logger.warning(
+                        f"more than one storage location for space {space}, choosing {storage}"
+                    )
         otype = kwargs.pop("otype") if "otype" in kwargs else None
         if isinstance(data, str) and data.startswith("s3:///"):
             # issue in Groovy / nf-lamin producing malformed S3 paths
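Per the new `NoStorageLocationForSpace` error message, a space must have at least one storage location before artifacts can be written into it. A sketch of both remedies (bucket and space names illustrative)::

    import lamindb as ln

    space = ln.Space.get(name="our-space")
    # either create a new storage location for the space ...
    ln.Storage(root="s3://our-bucket", space=space).save()
    # ... or manage an existing location through the space
    storage_loc = ln.Storage.get(root="s3://existing-bucket")
    storage_loc.space = space
    storage_loc.save()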
@@ -1658,6 +1630,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         is_automanaged_path = False
 
         provisional_uid, revises = create_uid(revises=revises, version=version)
+        run = get_run(run)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
             key=key,
@@ -1669,6 +1642,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             using_key=using_key,
             skip_check_exists=skip_check_exists,
             overwrite_versions=overwrite_versions,
+            skip_hash_lookup=skip_hash_lookup,
         )
 
         # an object with the same hash already exists
@@ -1685,6 +1659,8 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     f"key {self.key} on existing artifact differs from passed key {key}"
                 )
             update_attributes(self, attr_to_update)
+            if run is not None:
+                populate_subsequent_run(self, run)
             return None
         else:
             kwargs = kwargs_or_artifact
@@ -1724,7 +1700,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["version"] = version
         kwargs["description"] = description
         kwargs["branch"] = branch
-        kwargs["branch_id"] = branch_id
         kwargs["space"] = space
         kwargs["otype"] = otype
         kwargs["revises"] = revises
@@ -1739,43 +1714,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
-    @classmethod
-    def from_lazy(
-        cls,
-        suffix: str,
-        overwrite_versions: bool,
-        key: str | None = None,
-        description: str | None = None,
-        run: Run | None = None,
-        **kwargs,
-    ) -> LazyArtifact:
-        """Create a lazy artifact for streaming to auto-generated internal paths.
-
-        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
-        and register the path as an artifact.
-
-        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
-        on `.save()` with the provided arguments.
-
-        Args:
-            suffix: The suffix for the auto-generated internal path
-            overwrite_versions: Whether to overwrite versions.
-            key: An optional key to reference the artifact.
-            description: A description.
-            run: The run that creates the artifact.
-            **kwargs: Other keyword arguments for the artifact to be created.
-
-        Examples:
-
-            Create a lazy artifact, write to the path and save to get a real artifact::
-
-                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
-                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
-                artifact = lazy.save()
-        """
-        args = {"key": key, "description": description, "run": run, **kwargs}
-        return LazyArtifact(suffix, overwrite_versions, **args)
-
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1876,8 +1814,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="examples/my_file.parquet")
             artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
         """
-        from .query_set import QuerySet
-
         return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions)
 
     @classmethod
@@ -1909,6 +1845,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # from Registry metaclass
         return type(cls).filter(cls, *queries, **expressions)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @classmethod
     def from_dataframe(
         cls,
@@ -1932,21 +1905,15 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
-            features: …
+            features: Additional external features to link.
 
-        …
-            :meth:`~lamindb.Collection`
-                Track collections.
-            :class:`~lamindb.Feature`
-                Track features.
-
-        Example:
+        Examples:
 
             No validation and annotation::
 
                 import lamindb as ln
 
-                df = ln.…
+                df = ln.examples.datasets.mini_immuno.get_dataset1()
                 artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
             With validation and annotation.
@@ -1980,9 +1947,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
-
+        if features is not None:
+            artifact._external_features = features
         if schema is not None:
-            from lamindb.curators.core import …
+            from lamindb.curators.core import DataFrameCurator, ExperimentalDictCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1991,31 +1959,14 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-            …
-                external_slot = next(
-                    k for k in schema.slots.keys() if "__external__" in k
-                )
-                validation_schema = schema.slots[external_slot]
-            except StopIteration:
-                raise ValueError(
-                    "External feature validation requires a slot __external__."
-                ) from None
-
-            external_curator = ComponentCurator(
-                pd.DataFrame([features]), validation_schema
-            )
-            external_curator.validate()
-            artifact._external_features = features
-
-            # Validate main DataFrame if not Composite or if Composite has attrs
-            if schema.itype != "Composite" or "attrs" in schema.slots:
-                curator = ComponentCurator(artifact, schema)
-                curator.validate()
-                artifact.schema = schema
-                artifact._curator = curator
+            if features is not None and "__external__" in schema.slots:
+                validation_schema = schema.slots["__external__"]
+                ExperimentalDictCurator(features, validation_schema).validate()
 
+            curator = DataFrameCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
             return artifact
 
     @classmethod
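External features on `from_dataframe` are now validated through a composite schema's `__external__` slot via `ExperimentalDictCurator`, replacing the earlier `ComponentCurator` round-trip through a one-row DataFrame. A hedged sketch of the intended call, assuming slots-based composite schemas (all names illustrative)::

    import lamindb as ln

    study = ln.Feature(name="study", dtype=str).save()
    # hypothetical composite schema with an __external__ slot
    composite = ln.Schema(
        slots={"__external__": ln.Schema(features=[study]).save()}
    ).save()
    artifact = ln.Artifact.from_dataframe(
        df,
        key="examples/dataset1.parquet",
        schema=composite,
        features={"study": "study-1"},
    ).save()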
@@ -2076,7 +2027,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            adata = ln.…
+            adata = ln.examples.datasets.anndata_with_obs()
             artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
 
         With validation and annotation.
@@ -2164,7 +2115,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            mdata = ln.…
+            mdata = ln.examples.datasets.mudata_papalexi21_subset()
             artifact = ln.Artifact.from_mudata(mdata, key="mudata_papalexi21_subset.h5mu").save()
         """
         if not data_is_scversedatastructure(mdata, "MuData"):
@@ -2335,7 +2286,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            dir_path = ln.…
+            dir_path = ln.examples.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
             artifacts = ln.Artifact.from_dir(dir_path)
             ln.save(artifacts)
         """
@@ -2447,6 +2398,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         However, it will update the suffix if it changes.
         """
         storage = settings.storage.record
+        run = get_run(run)
         kwargs, privates = get_artifact_kwargs_from_data(
             provisional_uid=self.uid,
             data=data,
@@ -2690,7 +2642,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         access = _track_writes_factory(access, finalize)
         # only call if open is successful
-        _track_run_input(self, is_run_input)
+        track_run_input(self, is_run_input)
         return access
 
     def load(
@@ -2769,7 +2721,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         )
         access_memory = load_to_memory(cache_path, **kwargs)
         # only call if load is successful
-        _track_run_input(self, is_run_input)
+        track_run_input(self, is_run_input)
 
         return access_memory
 
@@ -2804,7 +2756,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             filepath, cache_key=cache_key, **kwargs
         )
         # only call if sync is successful
-        _track_run_input(self, is_run_input)
+        track_run_input(self, is_run_input)
         return cache_path
 
     def delete(
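All three data-access methods (`.open()`, `.load()`, `.cache()`) now call the renamed module-level `track_run_input` only after the underlying access succeeds. At usage time, input tracking looks like this::

    import lamindb as ln

    ln.track()  # set the run context
    artifact = ln.Artifact.get(key="examples/dataset1.parquet")
    df = artifact.load()  # tracked per ln.settings.track_run_inputs
    df = artifact.load(is_run_input=True)  # force tracking
    path = artifact.cache(is_run_input=False)  # opt out for this call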
@@ -2969,13 +2921,13 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
 
-        # …
-        if hasattr(self, "_external_features")…
+        # annotate with external features
+        if hasattr(self, "_external_features"):
             external_features = self._external_features
             delattr(self, "_external_features")
             self.features.add_values(external_features)
 
-        # annotate …
+        # annotate with internal features based on curator
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
@@ -3002,9 +2954,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         """
         return describe_artifact_collection(self, return_str=return_str)
 
-    def _populate_subsequent_runs(self, run: Run) -> None:
-        _populate_subsequent_runs_(self, run)
-
 
     # can't really just call .cache in .load because of double tracking
     def _synchronize_cleanup_on_error(
@@ -3068,15 +3017,20 @@ class ArtifactUser(BaseSQLRecord, IsLink, TracksRun):
         unique_together = ("artifact", "user", "feature")
 
 
-def _track_run_input(
-    data: (
+def track_run_input(
+    record: (
         Artifact | Iterable[Artifact]
     ),  # can also be Collection | Iterable[Collection]
     is_run_input: bool | Run | None = None,
     run: Run | None = None,
-):
+) -> None:
+    """Links a record as an input to a run.
+
+    This function contains all validation logic to make decisions on whether a
+    record qualifies as an input or not.
+    """
     if is_run_input is False:
-        return
+        return None
 
     from .._tracked import get_current_tracked_run
     from ..core._context import context
@@ -3089,133 +3043,138 @@ def _track_run_input(
         run = get_current_tracked_run()
         if run is None:
             run = context.run
-    # consider that …
-    …
-    […
+    # consider that record is an iterable of Data
+    record_iter: Iterable[Artifact] | Iterable[Collection] = (
+        [record] if isinstance(record, (Artifact, Collection)) else record
     )
-
-    input_data = []
+    input_records = []
     if run is not None:
-        …
+        assert not run._state.adding, "Save the run before tracking its inputs."  # noqa: S101
+
+        def is_valid_input(record: Artifact | Collection):
             is_valid = False
-            if …
+            # if a record is not yet saved it has record._state.db = None
+            # then it can't be an input
+            # we silently ignore because what will happen is that
+            # the record either gets saved and then is tracked as an output
+            # or it won't get saved at all
+            if record._state.db == "default":
                 # things are OK if the record is on the default db
                 is_valid = True
-            elif data._state.db is None:
-                # if a record is not yet saved, it can't be an input
-                # we silently ignore because what likely happens is that
-                # the user works with an object that's about to be saved
-                # in the current Python session
-                is_valid = False
             else:
                 # record is on another db
                 # we have to save the record into the current db with
                 # the run being attached to a transfer transform
                 logger.info(
-                    f"completing transfer to track {…
+                    f"completing transfer to track {record.__class__.__name__}('{record.uid}') as input"
                 )
-                …
+                record.save()
                 is_valid = True
-            …
+            # avoid cycles: record can't be both input and output
+            if record.run_id == run.id:
+                logger.debug(
+                    f"not tracking {record} as input to run {run} because created by same run"
+                )
+                is_valid = False
+            if run.id == getattr(record, "_subsequent_run_id", None):
+                logger.debug(
+                    f"not tracking {record} as input to run {run} because re-created in same run"
+                )
+                is_valid = False
+            return is_valid
 
-        …
-    if …
-        …
+        input_records = [record for record in record_iter if is_valid_input(record)]
+        input_records_ids = [record.id for record in input_records]
+    if input_records:
+        record_class_name = input_records[0].__class__.__name__.lower()
     # let us first look at the case in which the user does not
     # provide a boolean value for `is_run_input`
     # hence, we need to determine whether we actually want to
     # track a run or not
-    …
+    track = False
+    is_run_input = settings.track_run_inputs if is_run_input is None else is_run_input
+    if is_run_input:
         if run is None:
-            if …
-            …
-            if settings.track_run_inputs:
-                transform_note = ""
-                if len(input_data) == 1:
-                    if input_data[0].transform is not None:
-                        transform_note = (
-                            ", adding parent transform"
-                            f" {input_data[0].transform.id}"
-                        )
-                logger.info(
-                    f"adding {data_class_name} ids {input_data_ids} as inputs for run"
-                    f" {run.id}{transform_note}"
-                )
-                track_run_input = True
-            else:
-                logger.hint(
-                    "track these data as a run input by passing `is_run_input=True`"
-                )
+            if not is_read_only_connection():
+                logger.warning(WARNING_NO_INPUT)
+        elif input_records:
+            logger.debug(
+                f"adding {record_class_name} ids {input_records_ids} as inputs for run {run.id}"
+            )
+            track = True
     else:
-        …
-        if …
-        …
-        IsLink…
-        …
-            if available_spaces is None:
-                raise NoWriteAccess(
-                    f"You’re not allowed to write to the instance {instance.slug}.\n"
-                    "Please contact administrators of the instance if you need write access."
-                ) from None
-            write_access_spaces = (
-                available_spaces["admin"] + available_spaces["write"]
-            )
-            no_write_access_spaces = {
-                data_space
-                for data in input_data
-                if (data_space := data.space) not in write_access_spaces
-            }
-            if (run_space := run.space) not in write_access_spaces:
-                no_write_access_spaces.add(run_space)
-            if len(no_write_access_spaces) > 1:
-                name_msg = ", ".join(
-                    f"'{space.name}'" for space in no_write_access_spaces
-                )
-                space_msg = "spaces"
-            else:
-                name_msg = f"'{no_write_access_spaces.pop().name}'"
-                space_msg = "space"
+        track = is_run_input
+    if not track or not input_records:
+        return None
+    if run is None:
+        raise ValueError("No run context set. Call `ln.track()`.")
+    if record_class_name == "artifact":
+        IsLink = run.input_artifacts.through
+        links = [
+            IsLink(run_id=run.id, artifact_id=record_id)
+            for record_id in input_records_ids
+        ]
+    else:
+        IsLink = run.input_collections.through
+        links = [
+            IsLink(run_id=run.id, collection_id=record_id)
+            for record_id in input_records_ids
+        ]
+    try:
+        IsLink.objects.bulk_create(links, ignore_conflicts=True)
+    except ProgrammingError as e:
+        if "new row violates row-level security policy" in str(e):
+            instance = setup_settings.instance
+            available_spaces = instance.available_spaces
+            if available_spaces is None:
                 raise NoWriteAccess(
-                    f"You’re not allowed to write to the …
-                    …
+                    f"You’re not allowed to write to the instance {instance.slug}.\n"
+                    "Please contact administrators of the instance if you need write access."
                 ) from None
+            write_access_spaces = available_spaces["admin"] + available_spaces["write"]
+            no_write_access_spaces = {
+                record_space
+                for record in input_records
+                if (record_space := record.space) not in write_access_spaces
+            }
+            if (run_space := run.space) not in write_access_spaces:
+                no_write_access_spaces.add(run_space)
+
+            if not no_write_access_spaces:
+                # if there are no unavailable spaces, then this should be due to locking
+                locked_records = [
+                    record
+                    for record in input_records
+                    if getattr(record, "is_locked", False)
+                ]
+                if run.is_locked:
+                    locked_records.append(run)
+                # if no unavailable spaces and no locked records, just raise the original error
+                if not locked_records:
+                    raise e
+                no_write_msg = (
+                    "It is not allowed to modify locked records: "
+                    + ", ".join(
+                        r.__class__.__name__ + f"(uid={r.uid})" for r in locked_records
+                    )
+                    + "."
+                )
+                raise NoWriteAccess(no_write_msg) from None
+
+            if len(no_write_access_spaces) > 1:
+                name_msg = ", ".join(
+                    f"'{space.name}'" for space in no_write_access_spaces
+                )
+                space_msg = "spaces"
             else:
-                …
+                name_msg = f"'{no_write_access_spaces.pop().name}'"
+                space_msg = "space"
+            raise NoWriteAccess(
+                f"You’re not allowed to write to the {space_msg} {name_msg}.\n"
+                f"Please contact administrators of the {space_msg} if you need write access."
+            ) from None
+        else:
+            raise e
 
 
 # privates currently dealt with separately