lamindb 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +177 -89
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3349
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +4 -1
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +365 -286
  36. lamindb/models/_label_manager.py +8 -2
  37. lamindb/models/artifact.py +173 -95
  38. lamindb/models/artifact_set.py +122 -0
  39. lamindb/models/collection.py +73 -52
  40. lamindb/models/core.py +1 -1
  41. lamindb/models/feature.py +51 -17
  42. lamindb/models/has_parents.py +2 -2
  43. lamindb/models/project.py +1 -1
  44. lamindb/models/query_manager.py +221 -22
  45. lamindb/models/query_set.py +245 -171
  46. lamindb/models/record.py +62 -243
  47. lamindb/models/run.py +4 -4
  48. lamindb/models/save.py +8 -2
  49. lamindb/models/schema.py +458 -181
  50. lamindb/models/transform.py +2 -2
  51. lamindb/models/ulabel.py +8 -5
  52. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/METADATA +6 -6
  53. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/RECORD +55 -42
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/_label_manager.py

@@ -24,7 +24,7 @@ from ._describe import (
     TYPE_WIDTH,
     VALUES_WIDTH,
     describe_header,
-    print_rich_tree,
+    format_rich_tree,
 )
 from ._django import get_artifact_with_related, get_related_model
 from ._relations import dict_related_model_to_related_name
@@ -182,8 +182,14 @@ class LabelManager:
         self._host = host
 
     def __repr__(self) -> str:
+        return self.describe(return_str=True)
+
+    def describe(self, return_str=True) -> str:
+        """Describe the labels."""
         tree = describe_labels(self._host)
-        return print_rich_tree(tree, fallback="no linked labels")
+        return format_rich_tree(
+            tree, fallback="no linked labels", return_str=return_str
+        )
 
     def add(
         self,
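
This hunk renames `print_rich_tree` to `format_rich_tree` and gives `LabelManager` a public `describe()` whose `return_str` flag returns the rendered tree instead of printing it. A minimal sketch of the new call pattern, assuming an artifact with linked labels (the key is hypothetical):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")  # hypothetical key
    text = artifact.labels.describe(return_str=True)  # repr() now routes through this
    print(text)  # renders the label tree, or the fallback "no linked labels"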
lamindb/models/artifact.py

@@ -5,7 +5,7 @@ import os
 import shutil
 from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, Union, overload
+from typing import TYPE_CHECKING, Any, Literal, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
@@ -48,6 +48,11 @@ from ..core.storage import (
     write_to_disk,
 )
 from ..core.storage._anndata_accessor import _anndata_n_observations
+from ..core.storage._backed_access import (
+    _track_writes_factory,
+    backed_access,
+)
+from ..core.storage._polars_lazy_df import POLARS_SUFFIXES
 from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
 from ..core.storage._tiledbsoma import _soma_n_observations
 from ..core.storage.paths import (
@@ -105,9 +110,10 @@ except ImportError:
 
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
     from mudata import MuData  # noqa: TC004
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from spatialdata import SpatialData  # noqa: TC004
     from tiledbsoma import Collection as SOMACollection
@@ -311,10 +317,9 @@ def get_stat_or_artifact(
         result = Artifact.objects.using(instance).filter(hash=hash).all()
         artifact_with_same_hash_exists = len(result) > 0
     else:
-        storage_id = settings.storage.id
         result = (
             Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
+            .filter(Q(hash=hash) | Q(key=key, storage=settings.storage.record))
            .order_by("-created_at")
            .all()
        )
@@ -759,15 +764,15 @@ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
     return tree
 
 
-def describe_artifact_collection(self):  # for artifact & collection
-    from ._describe import print_rich_tree
+def describe_artifact_collection(self, return_str: bool = False) -> str | None:
+    from ._describe import format_rich_tree
 
     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
         tree = _describe_postgres(self)
     else:
         tree = _describe_sqlite(self)
 
-    print_rich_tree(tree)
+    return format_rich_tree(tree, return_str=return_str)
 
 
 def validate_feature(feature: Feature, records: list[Record]) -> None:
@@ -962,7 +967,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     Create an artifact **from a local file or folder**::
 
-        artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
 
     Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
@@ -977,29 +982,12 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
     You can make a **new version** of an artifact by passing an existing `key`::
 
-        artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
         artifact_v2.versions.df()  # see all versions
 
-    .. dropdown:: Why does the API look this way?
-
-        It's inspired by APIs building on AWS S3.
-
-        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+    You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
 
-        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
-
-            # signature: S3.Bucket.upload_file(filepath, key)
-            import boto3
-            s3 = boto3.resource('s3')
-            bucket = s3.Bucket('mybucket')
-            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
-
-        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
-
-            # signature: quilt3.Bucket.put_file(key, filepath)
-            import quilt3
-            bucket = quilt3.Bucket('mybucket')
-            bucket.put_file('hello.txt', '/tmp/hello.txt')
+        ln.settings.storage = "s3://some-bucket"
 
     Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
 
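The replaced dropdown gives way to documentation for switching the default storage location. A minimal sketch of the documented pattern, assuming write access to a registered bucket (the bucket name is hypothetical):

    import lamindb as ln

    ln.settings.storage = "s3://some-bucket"  # subsequent saves target this location
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()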
@@ -1034,6 +1022,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
     detects the duplication and will return the existing artifact.
 
+    .. dropdown:: Why does the constructor look the way it looks?
+
+        It's inspired by APIs building on AWS S3.
+
+        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
+
+        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
+
+            # signature: S3.Bucket.upload_file(filepath, key)
+            import boto3
+            s3 = boto3.resource('s3')
+            bucket = s3.Bucket('mybucket')
+            bucket.upload_file('/tmp/hello.txt', 'hello.txt')
+
+        In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
+
+            # signature: quilt3.Bucket.put_file(key, filepath)
+            import quilt3
+            bucket = quilt3.Bucket('mybucket')
+            bucket.put_file('hello.txt', '/tmp/hello.txt')
+
     See Also:
         :class:`~lamindb.Storage`
             Storage locations for artifacts.
@@ -1089,7 +1098,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ln.Artifact.filter(scientist="Barbara McClintock")
 
     Features may or may not be part of the artifact content in storage. For
-    instance, the :class:`~lamindb.Curator` flow validates the columns of a
+    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
     `DataFrame`-like artifact and annotates it with features corresponding to
     these columns. `artifact.features.add_values`, by contrast, does not
     validate the content of the artifact.
@@ -1525,7 +1534,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ::
 
             artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
-            artifact = ln.Arfifact.get(key="my_datasets/my_file.parquet")
+            artifact = ln.Arfifact.get(key="examples/my_file.parquet")
        """
        from .query_set import QuerySet
 
@@ -1550,7 +1559,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
 
        Query by fields::
 
-           ln.Arfifact.filter(key="my_datasets/my_file.parquet")
+           ln.Arfifact.filter(key="examples/my_file.parquet")
 
        Query by features::
 
@@ -1610,7 +1619,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from `DataFrame`, validate & link features.
+        """Create from `DataFrame`, optionally validate & annotate.
 
         Args:
             df: A `DataFrame` object.
@@ -1619,7 +1628,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1627,19 +1636,30 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                df = ln.core.datasets.mini_immuno.get_dataset1()
+                artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_dataframe_flexible.py
+                :language: python
+
+            Under-the-hood, this used the following schema.
+
+            .. literalinclude:: scripts/define_valid_features.py
+                :language: python
+
+            Valid features & labels were defined as:
+
+            .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+                :language: python
 
-            df = ln.core.datasets.df_iris_in_meter_batch1()
-            df.head()
-            #>   sepal_length  sepal_width  petal_length  petal_width  iris_organism_code
-            #> 0        0.051        0.035         0.014        0.002                   0
-            #> 1        0.049        0.030         0.014        0.002                   0
-            #> 2        0.047        0.032         0.013        0.002                   0
-            #> 3        0.046        0.031         0.015        0.002                   0
-            #> 4        0.050        0.036         0.014        0.002                   0
-            artifact = ln.Artifact.from_df(df, key="iris/result_batch1.parquet").save()
         """
         artifact = Artifact(  # type: ignore
             data=df,
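
With the reworked docstring, passing `schema` makes `from_df` validate the DataFrame before annotating the artifact. A minimal sketch under the assumption that a matching feature was registered beforehand; the feature name and schema construction are hypothetical:

    import lamindb as ln

    df = ln.core.datasets.mini_immuno.get_dataset1()
    # hypothetical schema over an already-registered feature
    schema = ln.Schema(features=[ln.Feature.get(name="perturbation")])
    artifact = ln.Artifact.from_df(
        df, key="examples/dataset1.parquet", schema=schema
    ).save()
    artifact.describe()  # shows the validated & annotated features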
@@ -1673,7 +1693,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``AnnData``, validate & link features.
+        """Create from `AnnData`, optionally validate & annotate.
 
         Args:
             adata: An `AnnData` object or a path of AnnData-like.
@@ -1682,7 +1702,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
 
@@ -1691,12 +1711,31 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                adata = ln.core.datasets.anndata_with_obs()
+                artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
+
+            With validation and annotation.
+
+            .. literalinclude:: scripts/curate_anndata_flexible.py
+                :language: python
+
+            Under-the-hood, this used the following schema.
+
+            .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
+                :language: python
+
+            This schema tranposes the `var` DataFrame during curation, so that one validates and annotates the `var.T` schema, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.
+            If one doesn't transpose, one would annotate with the schema of `var`, i.e., `[gene_symbol, gene_type]`.
+
+            .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png
+                :width: 800px
 
-            adata = ln.core.datasets.anndata_with_obs()
-            artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
         """
         if not data_is_anndata(adata):
             raise ValueError(
@@ -1745,7 +1784,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``MuData``, validate & link features.
+        """Create from `MuData`, optionally validate & annotate.
 
         Args:
             mdata: A `MuData` object.
@@ -1754,7 +1793,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1804,16 +1843,16 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         schema: Schema | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from ``SpatialData``, validate & link features.
+        """Create from `SpatialData`, optionally validate & annotate.
 
         Args:
-            mdata: A `SpatialData` object.
+            sdata: A `SpatialData` object.
             key: A relative path within default storage,
                 e.g., `"myfolder/myfile.zarr"`.
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
-            schema: A schema to validate & annotate.
+            schema: A schema that defines how to validate & annotate.
 
         See Also:
             :meth:`~lamindb.Collection`
@@ -1821,11 +1860,21 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             :class:`~lamindb.Feature`
                 Track features.
 
-        Example::
+        Example:
 
-            import lamindb as ln
+            No validation and annotation::
+
+                import lamindb as ln
+
+                artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
 
-            artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
+            With validation and annotation.
+
+            .. literalinclude:: scripts/define_schema_spatialdata.py
+                :language: python
+
+            .. literalinclude:: scripts/curate_spatialdata.py
+                :language: python
        """
        if not data_is_spatialdata(sdata):
            raise ValueError(
@@ -2117,29 +2166,39 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._old_suffix = self.suffix
 
     def open(
-        self, mode: str = "r", is_run_input: bool | None = None, **kwargs
-    ) -> Union[
-        AnnDataAccessor,
-        BackedAccessor,
-        SOMACollection,
-        SOMAExperiment,
-        SOMAMeasurement,
-        PyArrowDataset,
-    ]:
-        """Return a cloud-backed data object.
+        self,
+        mode: str = "r",
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> (
+        AnnDataAccessor
+        | BackedAccessor
+        | SOMACollection
+        | SOMAExperiment
+        | SOMAMeasurement
+        | PyArrowDataset
+        | Iterator[PolarsLazyFrame]
+    ):
+        """Open a dataset for streaming.
 
         Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
-        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
+        `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
+        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
         Args:
             mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
                 otherwise should be always `"r"` (read-only mode).
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+                This has no effect if the artifact is not a dataframe, i.e.
+                if it is an `AnnData,` `hdf5`, `zarr` or `tiledbsoma` object.
             is_run_input: Whether to track this artifact as run input.
             **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
-                `pyarrow.dataset.dataset`.
+                `pyarrow.dataset.dataset`, `polars.scan_*` function.
 
         Notes:
-            For more info, see tutorial: :doc:`/arrays`.
+            For more info, see guide: :doc:`/arrays`.
 
         Example::
 
@@ -2152,6 +2211,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            #> AnnDataAccessor object with n_obs × n_vars = 70 × 765
            #>   constructed for the AnnData object pbmc68k.h5ad
            #>   ...
+           artifact = ln.Artifact.get(key="lndb-storage/df.parquet")
+           artifact.open()
+           #> pyarrow._dataset.FileSystemDataset
+
        """
        if self._overwrite_versions and not self.is_latest:
            raise ValueError(INCONSISTENT_STATE_MSG)
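
The new `engine` parameter extends `open()` beyond `pyarrow`: for dataframe-like artifacts it returns a `pyarrow.dataset.Dataset` by default (as in the docstring example above), or `polars.LazyFrame` objects with `engine="polars"`, per the new return annotation. A minimal sketch of the default path, assuming a parquet artifact exists (the key is hypothetical):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")
    dataset = artifact.open()  # engine="pyarrow" -> pyarrow.dataset.Dataset
    print(dataset.head(5))  # stream the first rows without a full download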
@@ -2159,6 +2222,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         h5_suffixes = [".h5", ".hdf5", ".h5ad"]
         h5_suffixes += [s + ".gz" for s in h5_suffixes]
         # ignore empty suffix for now
+        df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
         suffixes = (
             (
                 "",
@@ -2167,7 +2231,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 ".tiledbsoma",
             )
             + tuple(h5_suffixes)
-            + PYARROW_SUFFIXES
+            + df_suffixes
             + tuple(
                 s + ".gz" for s in PYARROW_SUFFIXES
             )  # this doesn't work for externally gzipped files, REMOVE LATER
@@ -2175,10 +2239,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         if self.suffix not in suffixes:
             raise ValueError(
                 "Artifact should have a zarr, h5, tiledbsoma object"
-                " or a compatible `pyarrow.dataset.dataset` directory"
+                " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
                 " as the underlying data, please use one of the following suffixes"
                 f" for the object name: {', '.join(suffixes[1:])}."
-                f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
+                f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
                 " (no mixing allowed)."
             )
         if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
@@ -2187,10 +2251,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             )
 
         from lamindb import settings
-        from lamindb.core.storage._backed_access import (
-            _track_writes_factory,
-            backed_access,
-        )
 
         using_key = settings._using_key
         filepath, cache_key = filepath_cache_key_from_artifact(
@@ -2211,14 +2271,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         ) and not filepath.synchronize(localpath, just_check=True)
         if open_cache:
             try:
-                access = backed_access(localpath, mode, using_key, **kwargs)
+                access = backed_access(
+                    localpath, mode, engine, using_key=using_key, **kwargs
+                )
             except Exception as e:
-                if isinstance(filepath, LocalPathClasses):
+                # also ignore ValueError here because
+                # such errors most probably just imply an incorrect argument
+                if isinstance(filepath, LocalPathClasses) or isinstance(
+                    e, (ImportError, ValueError)
+                ):
                     raise e
                 logger.warning(
                     f"The cache might be corrupted: {e}. Trying to open directly."
                 )
-                access = backed_access(filepath, mode, using_key, **kwargs)
+                access = backed_access(
+                    filepath, mode, engine, using_key=using_key, **kwargs
+                )
                 # happens only if backed_access has been successful
                 # delete the corrupted cache
                 if localpath.is_dir():
@@ -2226,7 +2294,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 else:
                     localpath.unlink(missing_ok=True)
         else:
-            access = backed_access(filepath, mode, using_key, **kwargs)
+            access = backed_access(
+                filepath, mode, engine, using_key=using_key, **kwargs
+            )
         if is_tiledbsoma_w:
 
             def finalize():
@@ -2399,6 +2469,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Artifact.get(key="some.tiledbsoma". is_latest=True)
             artiact.delete()  # delete all versions, the data will be deleted or prompted for deletion.
         """
+        # we're *not* running the line below because the case `storage is None` triggers user feedback in one case
+        # storage = True if storage is None else storage
+
         # this first check means an invalid delete fails fast rather than cascading through
         # database and storage permission errors
         if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
@@ -2449,8 +2522,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # only delete in storage if DB delete is successful
         # DB delete might error because of a foreign key constraint violated etc.
         if self._overwrite_versions and self.is_latest:
-            # includes self
-            for version in self.versions.all():
+            logger.important(
+                "deleting all versions of this artifact because they all share the same store"
+            )
+            for version in self.versions.all():  # includes self
                 _delete_skip_storage(version)
         else:
             self._delete_skip_storage()
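
The added log line makes the semantics explicit: for artifacts whose versions overwrite each other in one shared store, deleting the latest version deletes every version. A minimal sketch, assuming such a versioned artifact exists (the key is hypothetical):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_folder", is_latest=True)
    artifact.delete()  # logs that all versions are deleted because they share one store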
@@ -2460,7 +2535,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             delete_in_storage = False
             if storage:
                 logger.warning(
-                    "Storage argument is ignored; can't delete storage on an previous version"
+                    "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
                 )
         elif self.key is None or self._key_is_virtual:
             # do not ask for confirmation also if storage is None
@@ -2577,14 +2652,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         self._branch_code = 1
         self.save()
 
-    def describe(self) -> None:
-        """Describe relations of record.
+    def describe(self, return_str: bool = False) -> None:
+        """Describe record including linked records.
 
-        Example::
-
-            artifact.describe()
+        Args:
+            return_str: Return a string instead of printing.
         """
-        return describe_artifact_collection(self)
+        return describe_artifact_collection(self, return_str=return_str)
 
     def _populate_subsequent_runs(self, run: Run) -> None:
         _populate_subsequent_runs_(self, run)
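
`Artifact.describe()` gains the same `return_str` flag as the label manager, wired through the reworked `describe_artifact_collection()` from an earlier hunk. A minimal sketch (the key is hypothetical):

    import lamindb as ln

    artifact = ln.Artifact.get(key="examples/my_file.parquet")
    artifact.describe()  # prints the tree, as before
    text = artifact.describe(return_str=True)  # returns it as a string instead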
@@ -2624,9 +2698,11 @@ def _save_skip_storage(artifact, **kwargs) -> None:
 
 class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
+    artifact: Artifact = ForeignKey(
+        Artifact, CASCADE, related_name="links_featurevalue"
+    )
     # we follow the lower() case convention rather than snake case for link models
-    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
+    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
 
     class Meta:
         unique_together = ("artifact", "featurevalue")
@@ -2634,9 +2710,11 @@ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
 
 class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
     id: int = models.BigAutoField(primary_key=True)
-    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
+    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_paramvalue")
     # we follow the lower() case convention rather than snake case for link models
-    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
+    paramvalue: ParamValue = ForeignKey(
+        ParamValue, PROTECT, related_name="links_artifact"
+    )
 
     class Meta:
         unique_together = ("artifact", "paramvalue")
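
In Django, `related_name="+"` disables the reverse accessor; replacing it with `links_featurevalue`, `links_paramvalue`, and `links_artifact` (matching migration 0092 in the file list) makes the link tables traversable from both ends. A minimal sketch of the queries this enables (the records are hypothetical):

    artifact.links_featurevalue.all()  # link rows from an artifact to its feature values
    feature_value.links_artifact.all()  # link rows from a feature value back to artifacts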