lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +203 -102
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/_tiledbsoma.py +29 -13
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1534 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +382 -287
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +177 -106
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +69 -14
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +247 -172
- lamindb/models/record.py +65 -247
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +456 -184
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/models/_label_manager.py
CHANGED
@@ -24,7 +24,7 @@ from ._describe import (
     TYPE_WIDTH,
     VALUES_WIDTH,
     describe_header,
-
+    format_rich_tree,
 )
 from ._django import get_artifact_with_related, get_related_model
 from ._relations import dict_related_model_to_related_name
@@ -182,8 +182,14 @@ class LabelManager:
         self._host = host
 
     def __repr__(self) -> str:
+        return self.describe(return_str=True)
+
+    def describe(self, return_str=True) -> str:
+        """Describe the labels."""
         tree = describe_labels(self._host)
-        return
+        return format_rich_tree(
+            tree, fallback="no linked labels", return_str=return_str
+        )
 
     def add(
         self,
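In usage terms, the new `describe` path means the label tree can now be captured as a string instead of only rendered. A minimal sketch, assuming an artifact with linked labels and a hypothetical key:

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    tree_text = artifact.labels.describe(return_str=True)  # the label tree as a string
    print(tree_text)  # falls back to "no linked labels" if nothing is linked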
lamindb/models/artifact.py
CHANGED
@@ -5,7 +5,7 @@ import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, Union, overload
+from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
@@ -17,7 +17,6 @@ from django.db.models import CASCADE, PROTECT, Q
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage_in_instance
-from lamindb_setup.core import doc_args
 from lamindb_setup.core._settings_storage import init_storage
 from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
 from lamindb_setup.core.types import UPathStr
@@ -48,6 +47,11 @@ from ..core.storage import (
     write_to_disk,
 )
 from ..core.storage._anndata_accessor import _anndata_n_observations
+from ..core.storage._backed_access import (
+    _track_writes_factory,
+    backed_access,
+)
+from ..core.storage._polars_lazy_df import POLARS_SUFFIXES
 from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
 from ..core.storage._tiledbsoma import _soma_n_observations
 from ..core.storage.paths import (
@@ -94,8 +98,6 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
 
 WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
 
-DEBUG_KWARGS_DOC = "**kwargs: Internal arguments for debugging."
-
 try:
     from ..core.storage._zarr import identify_zarr_type
 except ImportError:
@@ -105,9 +107,10 @@ except ImportError:
 
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
     from mudata import MuData  # noqa: TC004
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from spatialdata import SpatialData  # noqa: TC004
     from tiledbsoma import Collection as SOMACollection
@@ -311,10 +314,9 @@ def get_stat_or_artifact(
         result = Artifact.objects.using(instance).filter(hash=hash).all()
         artifact_with_same_hash_exists = len(result) > 0
     else:
-        storage_id = settings.storage.id
         result = (
             Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key,
+            .filter(Q(hash=hash) | Q(key=key, storage=settings.storage.record))
             .order_by("-created_at")
             .all()
         )
@@ -759,15 +761,15 @@ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
     return tree
 
 
-def describe_artifact_collection(self
-    from ._describe import
+def describe_artifact_collection(self, return_str: bool = False) -> str | None:
+    from ._describe import format_rich_tree
 
     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
         tree = _describe_postgres(self)
     else:
         tree = _describe_sqlite(self)
 
-
+    return format_rich_tree(tree, return_str=return_str)
 
 
 def validate_feature(feature: Feature, records: list[Record]) -> None:
@@ -909,7 +911,7 @@ def add_labels(
     for registry_name, records in records_by_registry.items():
         if not from_curator and feature.name in internal_features:
             raise ValidationError(
-                "Cannot manually annotate
+                "Cannot manually annotate a feature measured *within* the dataset. Please use a Curator."
             )
         if registry_name not in feature.dtype:
             if not feature.dtype.startswith("cat"):
@@ -962,7 +964,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     Create an artifact **from a local file or folder**::
 
-        artifact = ln.Artifact("./my_file.parquet", key="
+        artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
 
     Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
@@ -977,29 +979,12 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
-        artifact_v2 = ln.Artifact("./my_file.parquet", key="
+        artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact_v2.versions.df()  # see all versions
 
-
-
-    It's inspired by APIs building on AWS S3.
-
-    Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
-
-    In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
-
-        # signature: S3.Bucket.upload_file(filepath, key)
-        import boto3
-        s3 = boto3.resource('s3')
-        bucket = s3.Bucket('mybucket')
-        bucket.upload_file('/tmp/hello.txt', 'hello.txt')
-
-    In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
+    You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
-
-        import quilt3
-        bucket = quilt3.Bucket('mybucket')
-        bucket.put_file('hello.txt', '/tmp/hello.txt')
+        ln.settings.storage = "s3://some-bucket"
 
     Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
 
@@ -1034,6 +1019,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
     detects the duplication and will return the existing artifact.
 
+    .. dropdown:: Why does the constructor look the way it looks?
+
+        It's inspired by APIs building on AWS S3.
+
+        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+
+        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
+
+            # signature: S3.Bucket.upload_file(filepath, key)
+            import boto3
+            s3 = boto3.resource('s3')
+            bucket = s3.Bucket('mybucket')
+            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
+
+        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
+
+            # signature: quilt3.Bucket.put_file(key, filepath)
+            import quilt3
+            bucket = quilt3.Bucket('mybucket')
+            bucket.put_file('hello.txt', '/tmp/hello.txt')
+
     See Also:
         :class:`~lamindb.Storage`
             Storage locations for artifacts.
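For comparison with the two snippets in this dropdown, the equivalent lamindb call: a sketch in which the bucket is implied by the instance's default storage location and `key` plays the same role:

    # signature: Artifact(filepath, key=...)
    import lamindb as ln

    artifact = ln.Artifact('/tmp/hello.txt', key='hello.txt').save()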
@@ -1089,7 +1095,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ln.Artifact.filter(scientist="Barbara McClintock")
 
     Features may or may not be part of the artifact content in storage. For
-    instance, the :class:`~lamindb.
+    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
     `DataFrame`-like artifact and annotates it with features corresponding to
     these columns. `artifact.features.add_values`, by contrast, does not
     validate the content of the artifact.
@@ -1227,7 +1233,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         default=None,
         related_name="validated_artifacts",
     )
-    """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
+    """The schema that validated this artifact in a :class:`~lamindb.curators.core.Curator`."""
     feature_sets: Schema = models.ManyToManyField(
         Schema, related_name="artifacts", through="ArtifactSchema"
     )
@@ -1525,7 +1531,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ::
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
-            artifact = ln.Arfifact.get(key="
+            artifact = ln.Arfifact.get(key="examples/my_file.parquet")
         """
         from .query_set import QuerySet
 
@@ -1550,7 +1556,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
         Query by fields::
 
-            ln.Arfifact.filter(key="
+            ln.Arfifact.filter(key="examples/my_file.parquet")
 
         Query by features::
 
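Side by side, the two lookup styles in these docstrings come down to the following pattern (note that the docstrings themselves misspell `Artifact` as `Arfifact`); a sketch assuming the key exists in the instance:

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # exactly one match expected
    artifacts = ln.Artifact.filter(key="examples/my_file.parquet")  # a QuerySet, possibly empty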
@@ -1610,7 +1616,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from `DataFrame`, validate &
+        """Create from `DataFrame`, optionally validate & annotate.
 
         Args:
             df: A `DataFrame` object.
@@ -1619,7 +1625,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1627,19 +1633,30 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example
+        Example:
 
-
+            No validation and annotation::
+
+                import lamindb as ln
+
+                df = ln.core.datasets.mini_immuno.get_dataset1()
+                artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_dataframe_flexible.py
+                :language: python
+
+            Under-the-hood, this used the following schema.
+
+            .. literalinclude:: scripts/define_valid_features.py
+                :language: python
+
+            Valid features & labels were defined as:
+
+            .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+                :language: python
 
-            df = ln.core.datasets.df_iris_in_meter_batch1()
-            df.head()
-            #> sepal_length sepal_width petal_length petal_width iris_organism_code
-            #> 0 0.051 0.035 0.014 0.002 0
-            #> 1 0.049 0.030 0.014 0.002 0
-            #> 2 0.047 0.032 0.013 0.002 0
-            #> 3 0.046 0.031 0.015 0.002 0
-            #> 4 0.050 0.036 0.014 0.002 0
-            artifact = ln.Artifact.from_df(df, key="iris/result_batch1.parquet").save()
         """
         artifact = Artifact(  # type: ignore
             data=df,
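In runnable form, the two paths this docstring distinguishes might look as follows; the schema construction is an assumption sketched from the referenced scripts, not their verbatim content:

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()

    # path 1: no validation and annotation
    artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()

    # path 2: validate & annotate against a flexible schema (assumed constructor)
    schema = ln.Schema(itype=ln.Feature).save()  # validates columns against Feature
    artifact = ln.Artifact.from_df(
        df, key="examples/dataset1.parquet", schema=schema
    ).save()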
@@ -1673,7 +1690,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from
+        """Create from `AnnData`, optionally validate & annotate.
 
         Args:
             adata: An `AnnData` object or a path of AnnData-like.
@@ -1682,7 +1699,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
 
@@ -1691,12 +1708,31 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example
+        Example:
 
-
+            No validation and annotation::
+
+                import lamindb as ln
+
+                adata = ln.core.datasets.anndata_with_obs()
+                artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_anndata_flexible.py
+                :language: python
+
+            Under-the-hood, this used the following schema.
+
+            .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
+                :language: python
+
+            This schema tranposes the `var` DataFrame during curation, so that one validates and annotates the `var.T` schema, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.
+            If one doesn't transpose, one would annotate with the schema of `var`, i.e., `[gene_symbol, gene_type]`.
+
+            .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png
+                :width: 800px
 
-            adata = ln.core.datasets.anndata_with_obs()
-            artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
         """
         if not data_is_anndata(adata):
             raise ValueError(
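The transposition note is easiest to see on a toy `var` table; a standalone pandas sketch using the gene identifiers quoted above (the symbol and type values are illustrative):

    import pandas as pd

    var = pd.DataFrame(
        {"gene_symbol": ["CD8A", "CD4", "CD14"], "gene_type": ["protein_coding"] * 3},
        index=["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"],
    )
    print(list(var.columns))    # ['gene_symbol', 'gene_type'] -- the schema of var
    print(list(var.T.columns))  # the Ensembl gene ids -- the schema of var.T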
@@ -1745,7 +1781,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from
+        """Create from `MuData`, optionally validate & annotate.
 
         Args:
             mdata: A `MuData` object.
@@ -1754,7 +1790,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1804,16 +1840,16 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from
+        """Create from `SpatialData`, optionally validate & annotate.
 
         Args:
-
+            sdata: A `SpatialData` object.
             key: A relative path within default storage,
                 e.g., `"myfolder/myfile.zarr"`.
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1821,11 +1857,21 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example
+        Example:
 
-
+            No validation and annotation::
+
+                import lamindb as ln
+
+                artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
 
-
+            With validation and annotation.
+
+            .. literalinclude:: scripts/define_schema_spatialdata.py
+                :language: python
+
+            .. literalinclude:: scripts/curate_spatialdata.py
+                :language: python
         """
         if not data_is_spatialdata(sdata):
             raise ValueError(
@@ -2117,29 +2163,39 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._old_suffix = self.suffix
 
     def open(
-        self,
-
-
-
-
-
-
-
-
-
+        self,
+        mode: str = "r",
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> (
+        AnnDataAccessor
+        | BackedAccessor
+        | SOMACollection
+        | SOMAExperiment
+        | SOMAMeasurement
+        | PyArrowDataset
+        | Iterator[PolarsLazyFrame]
+    ):
+        """Open a dataset for streaming.
 
         Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
-        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats
+        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
+        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
         Args:
             mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
                 otherwise should be always `"r"` (read-only mode).
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+                This has no effect if the artifact is not a dataframe, i.e.
+                if it is an `AnnData,` `hdf5`, `zarr` or `tiledbsoma` object.
             is_run_input: Whether to track this artifact as run input.
             **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
-                `pyarrow.dataset.dataset
+                `pyarrow.dataset.dataset`, `polars.scan_*` function.
 
         Notes:
-            For more info, see
+            For more info, see guide: :doc:`/arrays`.
 
         Example::
 
@@ -2152,6 +2208,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             #> AnnDataAccessor object with n_obs × n_vars = 70 × 765
             #>   constructed for the AnnData object pbmc68k.h5ad
             #>   ...
+            artifact = ln.Artifact.get(key="lndb-storage/df.parquet")
+            artifact.open()
+            #> pyarrow._dataset.FileSystemDataset
+
         """
         if self._overwrite_versions and not self.is_latest:
             raise ValueError(INCONSISTENT_STATE_MSG)
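A sketch of both dataframe engines for a parquet artifact; the pyarrow line mirrors the docstring example, while the polars context-manager usage is an assumption consistent with the `Iterator[PolarsLazyFrame]` return annotation and the `/arrays` guide:

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key

    dataset = artifact.open()  # default engine="pyarrow" -> pyarrow.dataset.Dataset
    table = dataset.to_table()  # materialize the full table

    with artifact.open(engine="polars") as lazy_df:  # assumed context-manager usage
        df = lazy_df.collect()  # materialize a polars DataFrame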
@@ -2159,6 +2219,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         h5_suffixes = [".h5", ".hdf5", ".h5ad"]
         h5_suffixes += [s + ".gz" for s in h5_suffixes]
         # ignore empty suffix for now
+        df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
         suffixes = (
             (
                 "",
@@ -2167,7 +2228,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 ".tiledbsoma",
             )
             + tuple(h5_suffixes)
-            +
+            + df_suffixes
             + tuple(
                 s + ".gz" for s in PYARROW_SUFFIXES
             )  # this doesn't work for externally gzipped files, REMOVE LATER
@@ -2175,10 +2236,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if self.suffix not in suffixes:
             raise ValueError(
                 "Artifact should have a zarr, h5, tiledbsoma object"
-                " or a compatible `pyarrow.dataset.dataset` directory"
+                " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
                 " as the underlying data, please use one of the following suffixes"
                 f" for the object name: {', '.join(suffixes[1:])}."
-                f" Or no suffix for a folder with {', '.join(
+                f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
                 " (no mixing allowed)."
             )
         if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
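The `df_suffixes` union simply deduplicates what the two engines support; a standalone sketch with hypothetical stand-ins for the two constants:

    # hypothetical values; the real constants live in
    # core/storage/_pyarrow_dataset.py and core/storage/_polars_lazy_df.py
    PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".ipc")
    POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")

    df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
    # set order is unspecified; membership is what the suffix check relies on
    assert ".ndjson" in df_suffixes and ".json" in df_suffixes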
@@ -2187,10 +2248,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             )
 
         from lamindb import settings
-        from lamindb.core.storage._backed_access import (
-            _track_writes_factory,
-            backed_access,
-        )
 
         using_key = settings._using_key
         filepath, cache_key = filepath_cache_key_from_artifact(
@@ -2211,14 +2268,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ) and not filepath.synchronize(localpath, just_check=True)
         if open_cache:
             try:
-                access = backed_access(
+                access = backed_access(
+                    localpath, mode, engine, using_key=using_key, **kwargs
+                )
             except Exception as e:
-
+                # also ignore ValueError here because
+                # such errors most probably just imply an incorrect argument
+                if isinstance(filepath, LocalPathClasses) or isinstance(
+                    e, (ImportError, ValueError)
+                ):
                     raise e
                 logger.warning(
                     f"The cache might be corrupted: {e}. Trying to open directly."
                 )
-                access = backed_access(
+                access = backed_access(
+                    filepath, mode, engine, using_key=using_key, **kwargs
+                )
                 # happens only if backed_access has been successful
                 # delete the corrupted cache
                 if localpath.is_dir():
@@ -2226,7 +2291,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 else:
                     localpath.unlink(missing_ok=True)
         else:
-            access = backed_access(
+            access = backed_access(
+                filepath, mode, engine, using_key=using_key, **kwargs
+            )
             if is_tiledbsoma_w:
 
                 def finalize():
@@ -2327,7 +2394,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         _track_run_input(self, is_run_input)
         return access_memory
 
-    @doc_args(DEBUG_KWARGS_DOC)
     def cache(
         self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs
     ) -> Path:
@@ -2340,7 +2406,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         Args:
             mute: Silence logging of caching progress.
             is_run_input: Whether to track this artifact as run input.
-            {}
 
         Example::
 
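With the debug-kwargs boilerplate gone, the documented surface of `cache` is just `mute` and `is_run_input`; a minimal sketch (hypothetical key):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    local_path = artifact.cache(mute=True)  # sync to the local cache, return a Path
    assert local_path.exists()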
@@ -2399,6 +2464,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma". is_latest=True)
             artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
         """
+        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
+        # storage = True if storage is None else storage
+
         # this first check means an invalid delete fails fast rather than cascading through
         # database and storage permission errors
         if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
@@ -2449,8 +2517,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # only delete in storage if DB delete is successful
         # DB delete might error because of a foreign key constraint violated etc.
         if self._overwrite_versions and self.is_latest:
-
-
+            logger.important(
+                "deleting all versions of this artifact because they all share the same store"
+            )
+            for version in self.versions.all():  # includes self
                 _delete_skip_storage(version)
         else:
             self._delete_skip_storage()
@@ -2460,7 +2530,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         delete_in_storage = False
         if storage:
             logger.warning(
-                "
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
             )
         elif self.key is None or self._key_is_virtual:
             # do not ask for confirmation also if storage is None
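In usage terms, the new branch means that deleting the latest artifact of a family whose versions share one store removes every version; a sketch with a hypothetical key:

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet", is_latest=True)
    artifact.delete()  # logs "deleting all versions ..." when versions share a store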
@@ -2485,13 +2555,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if delete_msg != "did-not-delete":
             logger.success(f"deleted {colors.yellow(f'{path}')}")
 
-    @doc_args(DEBUG_KWARGS_DOC)
     def save(self, upload: bool | None = None, **kwargs) -> Artifact:
         """Save to database & storage.
 
         Args:
             upload: Trigger upload to cloud storage in instances with hybrid storage mode.
-            {}
 
         Example::
 
@@ -2577,14 +2645,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._branch_code = 1
         self.save()
 
-    def describe(self) -> None:
-        """Describe
-
-        Example::
+    def describe(self, return_str: bool = False) -> None:
+        """Describe record including linked records.
 
-
+        Args:
+            return_str: Return a string instead of printing.
         """
-        return describe_artifact_collection(self)
+        return describe_artifact_collection(self, return_str=return_str)
 
     def _populate_subsequent_runs(self, run: Run) -> None:
         _populate_subsequent_runs_(self, run)
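Usage sketch for the extended `describe` (hypothetical key):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")
    artifact.describe()  # prints a rich tree, including linked records
    text = artifact.describe(return_str=True)  # same tree, returned as a string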
@@ -2624,9 +2691,11 @@ def _save_skip_storage(artifact, **kwargs) -> None:
 
 class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(
+    artifact: Artifact = ForeignKey(
+        Artifact, CASCADE, related_name="links_featurevalue"
+    )
     # we follow the lower() case convention rather than snake case for link models
-    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="
+    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
         unique_together = ("artifact", "featurevalue")
@@ -2634,9 +2703,11 @@ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
 
 class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="
+    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_paramvalue")
     # we follow the lower() case convention rather than snake case for link models
-    paramvalue: ParamValue = ForeignKey(
+    paramvalue: ParamValue = ForeignKey(
+        ParamValue, PROTECT, related_name="links_artifact"
+    )
 
     class Meta:
         unique_together = ("artifact", "paramvalue")
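Given these `related_name` values, reverse traversal follows standard Django semantics; a sketch assuming saved records and a hypothetical key:

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    links = artifact.links_featurevalue.all()  # ArtifactFeatureValue rows
    artifact.links_paramvalue.all()            # ArtifactParamValue rows

    if links:
        fv = links.first().featurevalue
        fv.links_artifact.all()  # back from the FeatureValue to its artifact links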
@@ -2685,8 +2756,8 @@ def _track_run_input(
             # record is on another db
             # we have to save the record into the current db with
             # the run being attached to a transfer transform
-            logger.
-                f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
+            logger.info(
+                f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}...') as input"
             )
             data.save()
             is_valid = True