lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl

Files changed (57)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +203 -102
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/_tiledbsoma.py +29 -13
  20. lamindb/core/storage/objects.py +6 -0
  21. lamindb/core/subsettings/__init__.py +2 -0
  22. lamindb/core/subsettings/_annotation_settings.py +11 -0
  23. lamindb/curators/__init__.py +7 -3349
  24. lamindb/curators/_legacy.py +2056 -0
  25. lamindb/curators/core.py +1534 -0
  26. lamindb/errors.py +11 -0
  27. lamindb/examples/__init__.py +27 -0
  28. lamindb/examples/schemas/__init__.py +12 -0
  29. lamindb/examples/schemas/_anndata.py +25 -0
  30. lamindb/examples/schemas/_simple.py +19 -0
  31. lamindb/integrations/_vitessce.py +8 -5
  32. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  33. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  34. lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
  35. lamindb/models/__init__.py +4 -1
  36. lamindb/models/_describe.py +21 -4
  37. lamindb/models/_feature_manager.py +382 -287
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +177 -106
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/collection.py +73 -52
  42. lamindb/models/core.py +1 -1
  43. lamindb/models/feature.py +51 -17
  44. lamindb/models/has_parents.py +69 -14
  45. lamindb/models/project.py +1 -1
  46. lamindb/models/query_manager.py +221 -22
  47. lamindb/models/query_set.py +247 -172
  48. lamindb/models/record.py +65 -247
  49. lamindb/models/run.py +4 -4
  50. lamindb/models/save.py +8 -2
  51. lamindb/models/schema.py +456 -184
  52. lamindb/models/transform.py +2 -2
  53. lamindb/models/ulabel.py +8 -5
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
  56. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
  57. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/models/_label_manager.py
@@ -24,7 +24,7 @@ from ._describe import (
     TYPE_WIDTH,
     VALUES_WIDTH,
     describe_header,
-    print_rich_tree,
+    format_rich_tree,
 )
 from ._django import get_artifact_with_related, get_related_model
 from ._relations import dict_related_model_to_related_name
@@ -182,8 +182,14 @@ class LabelManager:
         self._host = host
 
     def __repr__(self) -> str:
+        return self.describe(return_str=True)
+
+    def describe(self, return_str=True) -> str:
+        """Describe the labels."""
         tree = describe_labels(self._host)
-        return print_rich_tree(tree, fallback="no linked labels")
+        return format_rich_tree(
+            tree, fallback="no linked labels", return_str=return_str
+        )
 
     def add(
         self,
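With this change, a label tree can be captured as a string rather than only printed, and `__repr__` routes through the same code path. A minimal usage sketch (the artifact `key` is hypothetical)::

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    print(artifact.labels)  # __repr__ now renders the formatted label tree
    tree_str = artifact.labels.describe()  # returns the tree as a str (return_str defaults to True here)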
lamindb/models/artifact.py
@@ -5,7 +5,7 @@ import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, Union, overload
+from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
@@ -17,7 +17,6 @@ from django.db.models import CASCADE, PROTECT, Q
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage_in_instance
-from lamindb_setup.core import doc_args
 from lamindb_setup.core._settings_storage import init_storage
 from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
 from lamindb_setup.core.types import UPathStr
@@ -48,6 +47,11 @@ from ..core.storage import (
     write_to_disk,
 )
 from ..core.storage._anndata_accessor import _anndata_n_observations
+from ..core.storage._backed_access import (
+    _track_writes_factory,
+    backed_access,
+)
+from ..core.storage._polars_lazy_df import POLARS_SUFFIXES
 from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
 from ..core.storage._tiledbsoma import _soma_n_observations
 from ..core.storage.paths import (
@@ -94,8 +98,6 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
 
 WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
 
-DEBUG_KWARGS_DOC = "**kwargs: Internal arguments for debugging."
-
 try:
     from ..core.storage._zarr import identify_zarr_type
 except ImportError:
@@ -105,9 +107,10 @@ except ImportError:
 
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
     from mudata import MuData  # noqa: TC004
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from spatialdata import SpatialData  # noqa: TC004
     from tiledbsoma import Collection as SOMACollection
@@ -311,10 +314,9 @@ def get_stat_or_artifact(
         result = Artifact.objects.using(instance).filter(hash=hash).all()
         artifact_with_same_hash_exists = len(result) > 0
     else:
-        storage_id = settings.storage.id
         result = (
             Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
+            .filter(Q(hash=hash) | Q(key=key, storage=settings.storage.record))
            .order_by("-created_at")
            .all()
        )
@@ -759,15 +761,15 @@ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
     return tree
 
 
-def describe_artifact_collection(self):  # for artifact & collection
-    from ._describe import print_rich_tree
+def describe_artifact_collection(self, return_str: bool = False) -> str | None:
+    from ._describe import format_rich_tree
 
     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
         tree = _describe_postgres(self)
     else:
         tree = _describe_sqlite(self)
 
-    print_rich_tree(tree)
+    return format_rich_tree(tree, return_str=return_str)
 
 
 def validate_feature(feature: Feature, records: list[Record]) -> None:
@@ -909,7 +911,7 @@ def add_labels(
     for registry_name, records in records_by_registry.items():
         if not from_curator and feature.name in internal_features:
             raise ValidationError(
-                "Cannot manually annotate internal feature with label. Please use ln.Curator"
+                "Cannot manually annotate a feature measured *within* the dataset. Please use a Curator."
             )
         if registry_name not in feature.dtype:
             if not feature.dtype.startswith("cat"):
@@ -962,7 +964,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     Create an artifact **from a local file or folder**::
 
-        artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
 
     Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
@@ -977,29 +979,12 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
-        artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact_v2.versions.df()  # see all versions
 
-    .. dropdown:: Why does the API look this way?
-
-        It's inspired by APIs building on AWS S3.
-
-        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
-
-        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
-
-            # signature: S3.Bucket.upload_file(filepath, key)
-            import boto3
-            s3 = boto3.resource('s3')
-            bucket = s3.Bucket('mybucket')
-            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
-
-        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
-
-            # signature: quilt3.Bucket.put_file(key, filepath)
-            import quilt3
-            bucket = quilt3.Bucket('mybucket')
-            bucket.put_file('hello.txt', '/tmp/hello.txt')
+    You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
+
+        ln.settings.storage = "s3://some-bucket"
 
     Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
 
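In practice, switching the default storage location redirects subsequent saves; a sketch with hypothetical bucket names::

    import lamindb as ln

    ln.settings.storage = "s3://some-bucket"  # hypothetical bucket; saves now write here
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    ln.settings.storage = "s3://other-bucket"  # switch again at any time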
@@ -1034,6 +1019,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
     detects the duplication and will return the existing artifact.
 
+    .. dropdown:: Why does the constructor look the way it does?
+
+        It's inspired by APIs building on AWS S3.
+
+        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+
+        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
+
+            # signature: S3.Bucket.upload_file(filepath, key)
+            import boto3
+            s3 = boto3.resource('s3')
+            bucket = s3.Bucket('mybucket')
+            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
+
+        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
+
+            # signature: quilt3.Bucket.put_file(key, filepath)
+            import quilt3
+            bucket = quilt3.Bucket('mybucket')
+            bucket.put_file('hello.txt', '/tmp/hello.txt')
+
     See Also:
         :class:`~lamindb.Storage`
             Storage locations for artifacts.
@@ -1089,7 +1095,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ln.Artifact.filter(scientist="Barbara McClintock")
 
     Features may or may not be part of the artifact content in storage. For
-    instance, the :class:`~lamindb.Curator` flow validates the columns of a
+    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
     `DataFrame`-like artifact and annotates it with features corresponding to
     these columns. `artifact.features.add_values`, by contrast, does not
     validate the content of the artifact.
@@ -1227,7 +1233,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         default=None,
         related_name="validated_artifacts",
     )
-    """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
+    """The schema that validated this artifact in a :class:`~lamindb.curators.core.Curator`."""
     feature_sets: Schema = models.ManyToManyField(
         Schema, related_name="artifacts", through="ArtifactSchema"
     )
@@ -1525,7 +1531,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ::
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
-            artifact = ln.Artifact.get(key="my_datasets/my_file.parquet")
+            artifact = ln.Artifact.get(key="examples/my_file.parquet")
         """
         from .query_set import QuerySet
@@ -1550,7 +1556,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
         Query by fields::
 
-            ln.Artifact.filter(key="my_datasets/my_file.parquet")
+            ln.Artifact.filter(key="examples/my_file.parquet")
 
         Query by features::
 
@@ -1610,7 +1616,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from `DataFrame`, validate & link features.
+        """Create from `DataFrame`, optionally validate & annotate.
 
         Args:
             df: A `DataFrame` object.
@@ -1619,7 +1625,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1627,19 +1633,30 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                df = ln.core.datasets.mini_immuno.get_dataset1()
+                artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_dataframe_flexible.py
+                :language: python
+
+            Under the hood, this uses the following schema.
+
+            .. literalinclude:: scripts/define_valid_features.py
+                :language: python
+
+            Valid features & labels were defined as:
+
+            .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+                :language: python
 
-            df = ln.core.datasets.df_iris_in_meter_batch1()
-            df.head()
-            #>   sepal_length  sepal_width  petal_length  petal_width  iris_organism_code
-            #> 0        0.051        0.035         0.014        0.002                   0
-            #> 1        0.049        0.030         0.014        0.002                   0
-            #> 2        0.047        0.032         0.013        0.002                   0
-            #> 3        0.046        0.031         0.015        0.002                   0
-            #> 4        0.050        0.036         0.014        0.002                   0
-            artifact = ln.Artifact.from_df(df, key="iris/result_batch1.parquet").save()
         """
         artifact = Artifact(  # type: ignore
             data=df,
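The referenced scripts aren't included in this diff. A rough sketch of the flow they implement, inferred from the docstring and the new `mini_immuno` dataset module; the feature/label names and the flexible-schema construction are assumptions::

    import lamindb as ln

    # define valid features & labels (assumed names)
    ln.Feature(name="perturbation", dtype="cat[ULabel]").save()
    ln.ULabel(name="DMSO").save()
    ln.ULabel(name="IFNG").save()

    # a flexible schema: validates the features it knows, tolerates extra columns
    schema = ln.Schema(itype=ln.Feature).save()

    df = ln.core.datasets.mini_immuno.get_dataset1()
    artifact = ln.Artifact.from_df(
        df, key="examples/dataset1.parquet", schema=schema
    ).save()  # raises ValidationError if the dataframe doesn't validate
    artifact.describe()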
@@ -1673,7 +1690,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``AnnData``, validate & link features.
+        """Create from `AnnData`, optionally validate & annotate.
 
         Args:
             adata: An `AnnData` object or a path to an AnnData-like object.
@@ -1682,7 +1699,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
 
@@ -1691,12 +1708,31 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                adata = ln.core.datasets.anndata_with_obs()
+                artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_anndata_flexible.py
+                :language: python
+
+            Under the hood, this uses the following schema.
+
+            .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
+                :language: python
+
+            This schema transposes the `var` DataFrame during curation, so that one validates and annotates the `var.T` schema, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.
+            If one doesn't transpose, one would annotate with the schema of `var`, i.e., `[gene_symbol, gene_type]`.
+
+            .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png
+                :width: 800px
 
-            adata = ln.core.datasets.anndata_with_obs()
-            artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
         """
         if not data_is_anndata(adata):
             raise ValueError(
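The `var.T` remark is easiest to see in schema code. A heavily hedged sketch of such a composite schema; the slot syntax and registry fields are assumptions based on the script name above::

    import lamindb as ln
    import bionty as bt

    obs_schema = ln.Schema(itype=ln.Feature)  # valid features in .obs
    varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)  # Ensembl IDs in var.index
    anndata_schema = ln.Schema(
        otype="AnnData",
        # "var.T" validates var.index (genes); plain "var" would validate var's columns
        slots={"obs": obs_schema, "var.T": varT_schema},
    ).save()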
@@ -1745,7 +1781,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``MuData``, validate & link features.
+        """Create from `MuData`, optionally validate & annotate.
 
         Args:
             mdata: A `MuData` object.
@@ -1754,7 +1790,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1804,16 +1840,16 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``SpatialData``, validate & link features.
+        """Create from `SpatialData`, optionally validate & annotate.
 
         Args:
-            mdata: A `SpatialData` object.
+            sdata: A `SpatialData` object.
             key: A relative path within default storage,
                 e.g., `"myfolder/myfile.zarr"`.
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1821,11 +1857,21 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
 
-            artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
+            With validation and annotation.
+
+            .. literalinclude:: scripts/define_schema_spatialdata.py
+                :language: python
+
+            .. literalinclude:: scripts/curate_spatialdata.py
+                :language: python
         """
         if not data_is_spatialdata(sdata):
             raise ValueError(
@@ -2117,29 +2163,39 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._old_suffix = self.suffix
 
     def open(
-        self, mode: str = "r", is_run_input: bool | None = None, **kwargs
-    ) -> Union[
-        AnnDataAccessor,
-        BackedAccessor,
-        SOMACollection,
-        SOMAExperiment,
-        SOMAMeasurement,
-        PyArrowDataset,
-    ]:
-        """Return a cloud-backed data object.
+        self,
+        mode: str = "r",
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> (
+        AnnDataAccessor
+        | BackedAccessor
+        | SOMACollection
+        | SOMAExperiment
+        | SOMAMeasurement
+        | PyArrowDataset
+        | Iterator[PolarsLazyFrame]
+    ):
+        """Open a dataset for streaming.
 
         Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
-        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
+        `tiledbsoma` objects (`.tiledbsoma`), and `pyarrow`- or `polars`-compatible formats
+        (`.parquet`, `.csv`, `.ipc`, etc., as files or directories of such files).
 
         Args:
             mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
                 otherwise should be always `"r"` (read-only mode).
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+                This has no effect if the artifact is not a dataframe, i.e.,
+                if it is an `AnnData`, `hdf5`, `zarr` or `tiledbsoma` object.
             is_run_input: Whether to track this artifact as run input.
             **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
-                `pyarrow.dataset.dataset`.
+                `pyarrow.dataset.dataset`, or a `polars.scan_*` function.
 
         Notes:
-            For more info, see tutorial: :doc:`/arrays`.
+            For more info, see guide: :doc:`/arrays`.
 
         Example::
 
@@ -2152,6 +2208,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             #> AnnDataAccessor object with n_obs × n_vars = 70 × 765
             #> constructed for the AnnData object pbmc68k.h5ad
             #> ...
+            artifact = ln.Artifact.get(key="lndb-storage/df.parquet")
+            artifact.open()
+            #> pyarrow._dataset.FileSystemDataset
+
         """
         if self._overwrite_versions and not self.is_latest:
             raise ValueError(INCONSISTENT_STATE_MSG)
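The widened signature in practice: `engine` selects the lazy-loading backend for tabular artifacts. A sketch, assuming a saved parquet artifact with a hypothetical key; the context-manager usage for polars is inferred from the `Iterator[PolarsLazyFrame]` return type::

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key

    dataset = artifact.open()  # default engine="pyarrow" -> pyarrow.dataset.Dataset
    table = dataset.to_table()

    with artifact.open(engine="polars") as lazy_df:  # polars.LazyFrame
        df = lazy_df.collect()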
@@ -2159,6 +2219,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         h5_suffixes = [".h5", ".hdf5", ".h5ad"]
         h5_suffixes += [s + ".gz" for s in h5_suffixes]
         # ignore empty suffix for now
+        df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
         suffixes = (
             (
                 "",
@@ -2167,7 +2228,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 ".tiledbsoma",
             )
             + tuple(h5_suffixes)
-            + PYARROW_SUFFIXES
+            + df_suffixes
             + tuple(
                 s + ".gz" for s in PYARROW_SUFFIXES
             )  # this doesn't work for externally gzipped files, REMOVE LATER
@@ -2175,10 +2236,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if self.suffix not in suffixes:
             raise ValueError(
                 "Artifact should have a zarr, h5, tiledbsoma object"
-                " or a compatible `pyarrow.dataset.dataset` directory"
+                " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
                 " as the underlying data, please use one of the following suffixes"
                 f" for the object name: {', '.join(suffixes[1:])}."
-                f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
+                f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
                 " (no mixing allowed)."
             )
         if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
@@ -2187,10 +2248,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             )
 
         from lamindb import settings
-        from lamindb.core.storage._backed_access import (
-            _track_writes_factory,
-            backed_access,
-        )
 
         using_key = settings._using_key
         filepath, cache_key = filepath_cache_key_from_artifact(
@@ -2211,14 +2268,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             ) and not filepath.synchronize(localpath, just_check=True)
             if open_cache:
                 try:
-                    access = backed_access(localpath, mode, using_key, **kwargs)
+                    access = backed_access(
+                        localpath, mode, engine, using_key=using_key, **kwargs
+                    )
                 except Exception as e:
-                    if isinstance(filepath, LocalPathClasses):
+                    # also ignore ValueError here because
+                    # such errors most probably just imply an incorrect argument
+                    if isinstance(filepath, LocalPathClasses) or isinstance(
+                        e, (ImportError, ValueError)
+                    ):
                         raise e
                     logger.warning(
                         f"The cache might be corrupted: {e}. Trying to open directly."
                     )
-                    access = backed_access(filepath, mode, using_key, **kwargs)
+                    access = backed_access(
+                        filepath, mode, engine, using_key=using_key, **kwargs
+                    )
                     # happens only if backed_access has been successful
                     # delete the corrupted cache
                     if localpath.is_dir():
@@ -2226,7 +2291,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                     else:
                         localpath.unlink(missing_ok=True)
         else:
-            access = backed_access(filepath, mode, using_key, **kwargs)
+            access = backed_access(
+                filepath, mode, engine, using_key=using_key, **kwargs
+            )
         if is_tiledbsoma_w:
 
             def finalize():
@@ -2327,7 +2394,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         _track_run_input(self, is_run_input)
         return access_memory
 
-    @doc_args(DEBUG_KWARGS_DOC)
     def cache(
         self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs
     ) -> Path:
@@ -2340,7 +2406,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         Args:
             mute: Silence logging of caching progress.
             is_run_input: Whether to track this artifact as run input.
-            {}
 
         Example::
 
@@ -2399,6 +2464,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma", is_latest=True)
             artifact.delete()  # deletes all versions; the data will be deleted or prompted for deletion
         """
+        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
+        # storage = True if storage is None else storage
+
         # this first check means an invalid delete fails fast rather than cascading through
         # database and storage permission errors
         if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
@@ -2449,8 +2517,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # only delete in storage if DB delete is successful
         # DB delete might error because of a foreign key constraint violated etc.
         if self._overwrite_versions and self.is_latest:
-            # includes self
-            for version in self.versions.all():
+            logger.important(
+                "deleting all versions of this artifact because they all share the same store"
+            )
+            for version in self.versions.all():  # includes self
                 _delete_skip_storage(version)
         else:
             self._delete_skip_storage()
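The behavioral nuance: when versions overwrite each other in a single store, deleting the latest artifact removes the whole version family, now with an explicit log line. A sketch (key hypothetical)::

    artifact = ln.Artifact.get(key="examples/my_file.parquet", is_latest=True)
    artifact.delete()
    #> deleting all versions of this artifact because they all share the same store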
@@ -2460,7 +2530,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             delete_in_storage = False
             if storage:
                 logger.warning(
-                    "Storage argument is ignored; can't delete storage on an previous version"
+                    "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
                 )
         elif self.key is None or self._key_is_virtual:
             # do not ask for confirmation also if storage is None
@@ -2485,13 +2555,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if delete_msg != "did-not-delete":
             logger.success(f"deleted {colors.yellow(f'{path}')}")
 
-    @doc_args(DEBUG_KWARGS_DOC)
     def save(self, upload: bool | None = None, **kwargs) -> Artifact:
         """Save to database & storage.
 
         Args:
             upload: Trigger upload to cloud storage in instances with hybrid storage mode.
-            {}
 
         Example::
 
@@ -2577,14 +2645,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._branch_code = 1
         self.save()
 
-    def describe(self) -> None:
-        """Describe relations of record.
-
-        Example::
+    def describe(self, return_str: bool = False) -> None:
+        """Describe the record including linked records.
 
-            artifact.describe()
+        Args:
+            return_str: Return a string instead of printing.
         """
-        return describe_artifact_collection(self)
+        return describe_artifact_collection(self, return_str=return_str)
 
     def _populate_subsequent_runs(self, run: Run) -> None:
         _populate_subsequent_runs_(self, run)
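`Artifact.describe` thus mirrors the `return_str` switch introduced for labels above::

    artifact.describe()  # prints the rich tree, as before
    text = artifact.describe(return_str=True)  # returns the tree as a string instead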
@@ -2624,9 +2691,11 @@ def _save_skip_storage(artifact, **kwargs) -> None:
 
 class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
+    artifact: Artifact = ForeignKey(
+        Artifact, CASCADE, related_name="links_featurevalue"
+    )
     # we follow the lower() case convention rather than snake case for link models
-    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
+    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
         unique_together = ("artifact", "featurevalue")
@@ -2634,9 +2703,11 @@ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
 
 class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
+    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_paramvalue")
     # we follow the lower() case convention rather than snake case for link models
-    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
+    paramvalue: ParamValue = ForeignKey(
+        ParamValue, PROTECT, related_name="links_artifact"
+    )
 
     class Meta:
         unique_together = ("artifact", "paramvalue")
@@ -2685,8 +2756,8 @@ def _track_run_input(
             # record is on another db
             # we have to save the record into the current db with
             # the run being attached to a transfer transform
-            logger.important(
-                f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
+            logger.info(
+                f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}...') as input"
             )
             data.save()
             is_valid = True