lamindb 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +177 -89
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +365 -286
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +173 -95
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +2 -2
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +245 -171
- lamindb/models/record.py +62 -243
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +458 -181
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/RECORD +55 -42
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/_label_manager.py
CHANGED
@@ -24,7 +24,7 @@ from ._describe import (
     TYPE_WIDTH,
     VALUES_WIDTH,
     describe_header,
-    …
+    format_rich_tree,
 )
 from ._django import get_artifact_with_related, get_related_model
 from ._relations import dict_related_model_to_related_name
@@ -182,8 +182,14 @@ class LabelManager:
         self._host = host
 
     def __repr__(self) -> str:
+        return self.describe(return_str=True)
+
+    def describe(self, return_str=True) -> str:
+        """Describe the labels."""
         tree = describe_labels(self._host)
-        return …
+        return format_rich_tree(
+            tree, fallback="no linked labels", return_str=return_str
+        )
 
     def add(
         self,
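`__repr__` on `LabelManager` now delegates to a public `describe` method. A minimal usage sketch, assuming a saved artifact with linked labels (the key is hypothetical):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    # renders linked labels as a rich tree; falls back to "no linked labels"
    print(artifact.labels.describe(return_str=True))
    # repr(artifact.labels) now returns the same string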
lamindb/models/artifact.py
CHANGED
@@ -5,7 +5,7 @@ import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, Union, overload
+from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
@@ -48,6 +48,11 @@ from ..core.storage import (
     write_to_disk,
 )
 from ..core.storage._anndata_accessor import _anndata_n_observations
+from ..core.storage._backed_access import (
+    _track_writes_factory,
+    backed_access,
+)
+from ..core.storage._polars_lazy_df import POLARS_SUFFIXES
 from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
 from ..core.storage._tiledbsoma import _soma_n_observations
 from ..core.storage.paths import (
@@ -105,9 +110,10 @@ except ImportError:
 
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
     from mudata import MuData  # noqa: TC004
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from spatialdata import SpatialData  # noqa: TC004
     from tiledbsoma import Collection as SOMACollection
@@ -311,10 +317,9 @@ def get_stat_or_artifact(
         result = Artifact.objects.using(instance).filter(hash=hash).all()
         artifact_with_same_hash_exists = len(result) > 0
     else:
-        storage_id = settings.storage.id
         result = (
             Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key, …
+            .filter(Q(hash=hash) | Q(key=key, storage=settings.storage.record))
             .order_by("-created_at")
             .all()
         )
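In user-facing terms, this query backs save-time deduplication: an existing artifact matches either by content hash or by the same `key` in the same storage location, newest first. A sketch of the effect, assuming default settings and a hypothetical local file:

    import lamindb as ln

    a1 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    a2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    assert a1.uid == a2.uid  # identical content & key -> existing artifact returned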
@@ -759,15 +764,15 @@ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
     return tree
 
 
-def describe_artifact_collection(self…
-    from ._describe import …
+def describe_artifact_collection(self, return_str: bool = False) -> str | None:
+    from ._describe import format_rich_tree
 
     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
         tree = _describe_postgres(self)
     else:
         tree = _describe_sqlite(self)
 
-    …
+    return format_rich_tree(tree, return_str=return_str)
 
 
 def validate_feature(feature: Feature, records: list[Record]) -> None:
@@ -962,7 +967,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     Create an artifact **from a local file or folder**::
 
-        artifact = ln.Artifact("./my_file.parquet", key="…
+        artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
 
     Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
@@ -977,29 +982,12 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
-        artifact_v2 = ln.Artifact("./my_file.parquet", key="…
+        artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact_v2.versions.df()  # see all versions
 
-
-
-    It's inspired by APIs building on AWS S3.
-
-    Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+    You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
-
-
-        # signature: S3.Bucket.upload_file(filepath, key)
-        import boto3
-        s3 = boto3.resource('s3')
-        bucket = s3.Bucket('mybucket')
-        bucket.upload_file('/tmp/hello.txt', 'hello.txt')
-
-    In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
-
-        # signature: quilt3.Bucket.put_file(key, filepath)
-        import quilt3
-        bucket = quilt3.Bucket('mybucket')
-        bucket.put_file('hello.txt', '/tmp/hello.txt')
+        ln.settings.storage = "s3://some-bucket"
 
     Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
 
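The boto3/quilt comparison removed here resurfaces below in a dropdown; what remains is the versioning and storage-switching flow. A combined sketch, with hypothetical paths and bucket name:

    import lamindb as ln

    artifact_v1 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    # ... the file content changes; re-using the key creates a new version
    artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    artifact_v2.versions.df()  # lists all versions

    ln.settings.storage = "s3://some-bucket"  # subsequent saves upload here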
@@ -1034,6 +1022,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
     detects the duplication and will return the existing artifact.
 
+    .. dropdown:: Why does the constructor look the way it looks?
+
+        It's inspired by APIs building on AWS S3.
+
+        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+
+        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
+
+            # signature: S3.Bucket.upload_file(filepath, key)
+            import boto3
+            s3 = boto3.resource('s3')
+            bucket = s3.Bucket('mybucket')
+            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
+
+        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
+
+            # signature: quilt3.Bucket.put_file(key, filepath)
+            import quilt3
+            bucket = quilt3.Bucket('mybucket')
+            bucket.put_file('hello.txt', '/tmp/hello.txt')
+
     See Also:
         :class:`~lamindb.Storage`
             Storage locations for artifacts.
@@ -1089,7 +1098,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ln.Artifact.filter(scientist="Barbara McClintock")
 
     Features may or may not be part of the artifact content in storage. For
-    instance, the :class:`~lamindb.…
+    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
     `DataFrame`-like artifact and annotates it with features corresponding to
     these columns. `artifact.features.add_values`, by contrast, does not
     validate the content of the artifact.
@@ -1525,7 +1534,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ::
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
-            artifact = ln.Artifact.get(key="…
+            artifact = ln.Artifact.get(key="examples/my_file.parquet")
         """
         from .query_set import QuerySet
 
@@ -1550,7 +1559,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
         Query by fields::
 
-            ln.Artifact.filter(key="…
+            ln.Artifact.filter(key="examples/my_file.parquet")
 
         Query by features::
 
@@ -1610,7 +1619,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from `DataFrame`, validate & …
+        """Create from `DataFrame`, optionally validate & annotate.
 
         Args:
             df: A `DataFrame` object.
@@ -1619,7 +1628,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1627,19 +1636,30 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example…
+        Example:
 
-            …
+            No validation and annotation::
+
+                import lamindb as ln
+
+                df = ln.core.datasets.mini_immuno.get_dataset1()
+                artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+
+            With validation and annotation:
+
+            .. literalinclude:: scripts/curate_dataframe_flexible.py
+                :language: python
+
+            Under the hood, this used the following schema:
+
+            .. literalinclude:: scripts/define_valid_features.py
+                :language: python
+
+            Valid features & labels were defined as:
+
+            .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+                :language: python
 
-            df = ln.core.datasets.df_iris_in_meter_batch1()
-            df.head()
-            #>   sepal_length  sepal_width  petal_length  petal_width  iris_organism_code
-            #> 0        0.051        0.035         0.014        0.002                   0
-            #> 1        0.049        0.030         0.014        0.002                   0
-            #> 2        0.047        0.032         0.013        0.002                   0
-            #> 3        0.046        0.031         0.015        0.002                   0
-            #> 4        0.050        0.036         0.014        0.002                   0
-            artifact = ln.Artifact.from_df(df, key="iris/result_batch1.parquet").save()
         """
         artifact = Artifact(  # type: ignore
             data=df,
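The `literalinclude` scripts are not part of this diff. A hedged sketch of the validated flow they likely illustrate, assuming a `Schema` was defined and saved under a hypothetical name:

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()

    # without validation: ingest as-is
    artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()

    # with validation & annotation against a saved schema (name hypothetical)
    schema = ln.Schema.get(name="valid_features")
    artifact = ln.Artifact.from_df(
        df, key="examples/dataset1.parquet", schema=schema
    ).save()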
@@ -1673,7 +1693,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from …
+        """Create from `AnnData`, optionally validate & annotate.
 
         Args:
             adata: An `AnnData` object or a path of AnnData-like.
@@ -1682,7 +1702,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
 
@@ -1691,12 +1711,31 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example…
+        Example:
 
-            …
+            No validation and annotation::
+
+                import lamindb as ln
+
+                adata = ln.core.datasets.anndata_with_obs()
+                artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
+
+            With validation and annotation:
+
+            .. literalinclude:: scripts/curate_anndata_flexible.py
+                :language: python
+
+            Under the hood, this used the following schema:
+
+            .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
+                :language: python
+
+            This schema transposes the `var` DataFrame during curation, so that one validates and annotates the `var.T` schema, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.
+            If one doesn't transpose, one would annotate with the schema of `var`, i.e., `[gene_symbol, gene_type]`.
+
+            .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png
+                :width: 800px
 
-            adata = ln.core.datasets.anndata_with_obs()
-            artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
         """
         if not data_is_anndata(adata):
             raise ValueError(
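To make the transposition remark concrete: validating `var.T` matches the schema against the index of `var` (the Ensembl gene IDs) rather than its columns. An illustration in plain pandas terms:

    # which identifiers a schema sees depends on orientation
    adata.var.columns    # ['gene_symbol', 'gene_type']  -> schema of `var`
    adata.var.T.columns  # ['ENSG00000153563', ...]      -> schema of `var.T` (used here)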
@@ -1745,7 +1784,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from …
+        """Create from `MuData`, optionally validate & annotate.
 
         Args:
             mdata: A `MuData` object.
@@ -1754,7 +1793,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1804,16 +1843,16 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from …
+        """Create from `SpatialData`, optionally validate & annotate.
 
         Args:
-            …
+            sdata: A `SpatialData` object.
             key: A relative path within default storage,
                 e.g., `"myfolder/myfile.zarr"`.
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            …
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1821,11 +1860,21 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example…
+        Example:
 
-            …
+            No validation and annotation::
+
+                import lamindb as ln
+
+                artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
 
-            …
+            With validation and annotation:
+
+            .. literalinclude:: scripts/define_schema_spatialdata.py
+                :language: python
+
+            .. literalinclude:: scripts/curate_spatialdata.py
+                :language: python
         """
         if not data_is_spatialdata(sdata):
             raise ValueError(
@@ -2117,29 +2166,39 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._old_suffix = self.suffix
 
     def open(
-        self,
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        self,
+        mode: str = "r",
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> (
+        AnnDataAccessor
+        | BackedAccessor
+        | SOMACollection
+        | SOMAExperiment
+        | SOMAMeasurement
+        | PyArrowDataset
+        | Iterator[PolarsLazyFrame]
+    ):
+        """Open a dataset for streaming.
 
         Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
-        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats…
+        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
+        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
         Args:
             mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
                 otherwise should be always `"r"` (read-only mode).
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+                This has no effect if the artifact is not a dataframe, i.e.
+                if it is an `AnnData`, `hdf5`, `zarr` or `tiledbsoma` object.
             is_run_input: Whether to track this artifact as run input.
             **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
-                `pyarrow.dataset.dataset…
+                `pyarrow.dataset.dataset` or `polars.scan_*` function.
 
         Notes:
-            For more info, see …
+            For more info, see guide: :doc:`/arrays`.
 
         Example::
 
@@ -2152,6 +2211,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             #> AnnDataAccessor object with n_obs × n_vars = 70 × 765
             #>     constructed for the AnnData object pbmc68k.h5ad
             #>     ...
+            artifact = ln.Artifact.get(key="lndb-storage/df.parquet")
+            artifact.open()
+            #> pyarrow._dataset.FileSystemDataset
+
         """
         if self._overwrite_versions and not self.is_latest:
             raise ValueError(INCONSISTENT_STATE_MSG)
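The default engine returns a `pyarrow.dataset.Dataset`. With `engine="polars"`, the `Iterator[PolarsLazyFrame]` return type indicates the lazy frame is yielded from a generator, i.e. consumed as a context manager. A hedged sketch re-using the key from the example above:

    artifact = ln.Artifact.get(key="lndb-storage/df.parquet")

    # pyarrow (default): a pyarrow.dataset.Dataset
    dataset = artifact.open()
    print(dataset.head(5))

    # polars: the context-manager usage is an assumption based on the
    # Iterator[PolarsLazyFrame] return type; handles stay open inside the block
    with artifact.open(engine="polars") as lazy_df:
        print(lazy_df.head(5).collect())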
@@ -2159,6 +2222,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         h5_suffixes = [".h5", ".hdf5", ".h5ad"]
         h5_suffixes += [s + ".gz" for s in h5_suffixes]
         # ignore empty suffix for now
+        df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
         suffixes = (
             (
                 "",
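`df_suffixes` is the union of what either engine can stream. The two constants live in `_pyarrow_dataset.py` and `_polars_lazy_df.py` and are not shown in this diff; an illustration with assumed values:

    # assumed example values, not the actual constants
    PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".ipc")
    POLARS_SUFFIXES = (".parquet", ".csv", ".ipc", ".ndjson")

    # deduplicated union; order is not guaranteed because it passes through a set
    df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))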
@@ -2167,7 +2231,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             ".tiledbsoma",
         )
         + tuple(h5_suffixes)
-        + …
+        + df_suffixes
         + tuple(
             s + ".gz" for s in PYARROW_SUFFIXES
         )  # this doesn't work for externally gzipped files, REMOVE LATER
@@ -2175,10 +2239,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if self.suffix not in suffixes:
             raise ValueError(
                 "Artifact should have a zarr, h5, tiledbsoma object"
-                " or a compatible `pyarrow.dataset.dataset` directory"
+                " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
                 " as the underlying data, please use one of the following suffixes"
                 f" for the object name: {', '.join(suffixes[1:])}."
-                f" Or no suffix for a folder with {', '.join(…
+                f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
                 " (no mixing allowed)."
             )
         if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
@@ -2187,10 +2251,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             )
 
         from lamindb import settings
-        from lamindb.core.storage._backed_access import (
-            _track_writes_factory,
-            backed_access,
-        )
 
         using_key = settings._using_key
         filepath, cache_key = filepath_cache_key_from_artifact(
@@ -2211,14 +2271,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ) and not filepath.synchronize(localpath, just_check=True)
         if open_cache:
             try:
-                access = backed_access(…
+                access = backed_access(
+                    localpath, mode, engine, using_key=using_key, **kwargs
+                )
             except Exception as e:
-                …
+                # also ignore ValueError here because
+                # such errors most probably just imply an incorrect argument
+                if isinstance(filepath, LocalPathClasses) or isinstance(
+                    e, (ImportError, ValueError)
+                ):
                     raise e
                 logger.warning(
                     f"The cache might be corrupted: {e}. Trying to open directly."
                 )
-                access = backed_access(…
+                access = backed_access(
+                    filepath, mode, engine, using_key=using_key, **kwargs
+                )
                 # happens only if backed_access has been successful
                 # delete the corrupted cache
                 if localpath.is_dir():
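The pattern here: try the synced local cache first; re-raise immediately for local paths and for `ImportError`/`ValueError` (missing engine or bad arguments), otherwise treat the failure as cache corruption, open the remote path directly, and only then delete the cache. A generic sketch of that fallback, not lamindb's exact code:

    import os
    import shutil

    def open_with_cache_fallback(open_fn, localpath, remotepath):
        """Try a local cache; on presumed corruption, open remotely and drop the cache."""
        try:
            return open_fn(localpath)
        except (ImportError, ValueError):
            raise  # bad argument or missing dependency, not a corrupted cache
        except Exception:
            access = open_fn(remotepath)  # only reached if the remote open succeeds
            # the cache is presumed corrupted; remove it so it re-syncs next time
            if os.path.isdir(localpath):
                shutil.rmtree(localpath)
            else:
                os.unlink(localpath)
            return access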
@@ -2226,7 +2294,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 else:
                     localpath.unlink(missing_ok=True)
         else:
-            access = backed_access(…
+            access = backed_access(
+                filepath, mode, engine, using_key=using_key, **kwargs
+            )
         if is_tiledbsoma_w:
 
             def finalize():
@@ -2399,6 +2469,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
             artifact.delete()  # delete all versions, the data will be deleted or prompted for deletion.
         """
+        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
+        # storage = True if storage is None else storage
+
         # this first check means an invalid delete fails fast rather than cascading through
         # database and storage permission errors
         if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
@@ -2449,8 +2522,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # only delete in storage if DB delete is successful
         # DB delete might error because of a foreign key constraint violated etc.
         if self._overwrite_versions and self.is_latest:
-            …
-            …
+            logger.important(
+                "deleting all versions of this artifact because they all share the same store"
+            )
+            for version in self.versions.all():  # includes self
                 _delete_skip_storage(version)
         else:
             self._delete_skip_storage()
@@ -2460,7 +2535,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         delete_in_storage = False
         if storage:
             logger.warning(
-                "…
+                "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
             )
         elif self.key is None or self._key_is_virtual:
             # do not ask for confirmation also if storage is None
@@ -2577,14 +2652,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._branch_code = 1
         self.save()
 
-    def describe(self) -> None:
-        """Describe…
+    def describe(self, return_str: bool = False) -> None:
+        """Describe record including linked records.
 
-        …
-        …
-            artifact.describe()
+        Args:
+            return_str: Return a string instead of printing.
         """
-        return describe_artifact_collection(self)
+        return describe_artifact_collection(self, return_str=return_str)
 
     def _populate_subsequent_runs(self, run: Run) -> None:
         _populate_subsequent_runs_(self, run)
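Usage of the extended signature; note the annotated return type stays `-> None` even though a string is returned when `return_str=True`:

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    artifact.describe()                        # prints a rich tree
    text = artifact.describe(return_str=True)  # returns the tree as a string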
@@ -2624,9 +2698,11 @@ def _save_skip_storage(artifact, **kwargs) -> None:
 
 
 class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(…
+    artifact: Artifact = ForeignKey(
+        Artifact, CASCADE, related_name="links_featurevalue"
+    )
     # we follow the lower() case convention rather than snake case for link models
-    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="…
+    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
         unique_together = ("artifact", "featurevalue")
@@ -2634,9 +2710,11 @@ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
 
 
 class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="…
+    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_paramvalue")
     # we follow the lower() case convention rather than snake case for link models
-    paramvalue: ParamValue = ForeignKey(…
+    paramvalue: ParamValue = ForeignKey(
+        ParamValue, PROTECT, related_name="links_artifact"
+    )
 
     class Meta:
         unique_together = ("artifact", "paramvalue")