PyPI - lamindb - Versions diffs - 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

lamindb/__init__.py +89 -49
lamindb/_finish.py +17 -15
lamindb/_tracked.py +2 -4
lamindb/_view.py +1 -1
lamindb/base/__init__.py +2 -1
lamindb/base/dtypes.py +76 -0
lamindb/core/_settings.py +45 -2
lamindb/core/storage/_anndata_accessor.py +118 -26
lamindb/core/storage/_backed_access.py +10 -7
lamindb/core/storage/_spatialdata_accessor.py +15 -4
lamindb/core/storage/_zarr.py +3 -0
lamindb/curators/_legacy.py +16 -3
lamindb/curators/core.py +449 -193
lamindb/errors.py +6 -0
lamindb/examples/cellxgene/__init__.py +8 -3
lamindb/examples/cellxgene/_cellxgene.py +127 -13
lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
lamindb/examples/croissant/__init__.py +32 -6
lamindb/examples/datasets/__init__.py +2 -2
lamindb/examples/datasets/_core.py +9 -2
lamindb/examples/datasets/_small.py +66 -22
lamindb/examples/fixtures/sheets.py +8 -2
lamindb/integrations/_croissant.py +34 -11
lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
lamindb/migrations/0121_recorduser.py +60 -0
lamindb/models/__init__.py +4 -1
lamindb/models/_describe.py +2 -2
lamindb/models/_feature_manager.py +131 -71
lamindb/models/_from_values.py +2 -2
lamindb/models/_is_versioned.py +4 -4
lamindb/models/_label_manager.py +4 -4
lamindb/models/artifact.py +357 -192
lamindb/models/artifact_set.py +45 -1
lamindb/models/can_curate.py +1 -2
lamindb/models/collection.py +3 -34
lamindb/models/feature.py +111 -7
lamindb/models/has_parents.py +11 -11
lamindb/models/project.py +42 -2
lamindb/models/query_manager.py +16 -7
lamindb/models/query_set.py +191 -78
lamindb/models/record.py +30 -5
lamindb/models/run.py +10 -33
lamindb/models/save.py +6 -8
lamindb/models/schema.py +54 -26
lamindb/models/sqlrecord.py +152 -40
lamindb/models/storage.py +59 -14
lamindb/models/transform.py +17 -17
lamindb/models/ulabel.py +6 -1
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0

lamindb/core/storage/_anndata_accessor.py CHANGED Viewed

@@ -13,12 +13,17 @@ from anndata import __version__ as anndata_version
 from anndata._core.index import _normalize_indices
 from anndata._core.views import _resolve_idx
 from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
-from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
+from anndata._io.specs.registry import (
+    get_spec,
+    read_elem,
+    read_elem_partial,
+    write_elem,
+)
 from anndata.compat import _read_attr
 from fsspec.implementations.local import LocalFileSystem
 from fsspec.utils import infer_compression
 from lamin_utils import logger
-from lamindb_setup.core.upath import infer_filesystem
+from lamindb_setup.core.upath import S3FSMap, infer_filesystem
 from packaging import version
 from upath import UPath
@@ -28,6 +33,8 @@ if TYPE_CHECKING:
     from fsspec.core import OpenFile
     from lamindb_setup.types import UPathStr
+    from lamindb import Artifact
 anndata_version_parse = version.parse(anndata_version)
@@ -288,7 +295,7 @@ except ImportError:
 if ZARR_INSTALLED:
     from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
-    from ._zarr import get_zarr_store
+    from ._zarr import IS_ZARR_V3, get_zarr_store
     ArrayTypes.append(zarr.Array)
     GroupTypes.append(zarr.Group)
@@ -299,7 +306,18 @@ if ZARR_INSTALLED:
         assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  #  noqa: S101
         store = get_zarr_store(filepath)
-        storage = zarr.open(store, mode=mode)
+        kwargs = {}
+        if IS_ZARR_V3 and mode != "r":
+            # otherwise unable to write
+            kwargs["use_consolidated"] = False
+        storage = zarr.open(store, mode=mode, **kwargs)
+        # zarr v2 re-initializes the mapper
+        # we need to put back the correct one
+        # S3FSMap is returned from get_zarr_store only for zarr v2
+        if isinstance(store, S3FSMap):
+            assert not IS_ZARR_V3  # noqa: S101
+            storage.store.map = store
         conn = None
         return conn, storage
@@ -351,10 +369,10 @@ if ZARR_INSTALLED:
     # this is needed because accessing zarr.Group.keys() directly is very slow
     @registry.register("zarr")
     def keys(storage: zarr.Group):
-        if hasattr(storage, "_sync_iter"):  # zarr v3
+        if IS_ZARR_V3:
             paths = storage._sync_iter(storage.store.list())
         else:
-            paths = storage.store.keys()  # zarr v2
+            paths = storage.store.keys()
         attrs_keys: dict[str, list] = {}
         obs_var_arrays = []
@@ -438,9 +456,15 @@ def _try_backed_full(elem):
     return read_elem(elem)
+def _to_index(elem: np.ndarray):
+    if elem.dtype in (np.float64, np.int64):
+        elem = elem.astype(str)
+    return pd.Index(elem)
 def _safer_read_index(elem):
     if isinstance(elem, GroupTypes):
-        return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
+        return _to_index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
     elif isinstance(elem, ArrayTypes):
         indices = None
         for index_name in ("index", "_index"):
@@ -450,7 +474,7 @@ def _safer_read_index(elem):
         if indices is not None and len(indices) > 0:
             if isinstance(indices[0], bytes):
                 indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
-            return pd.Index(indices)
+            return _to_index(indices)
         else:
             raise ValueError("Indices not found.")
     else:
@@ -479,33 +503,40 @@ class _MapAccessor:
         return descr
+def _safer_read_df(elem, indices=None):
+    if indices is not None:
+        obj = registry.safer_read_partial(elem, indices=indices)
+        df = _records_to_df(obj)
+    else:
+        df = registry.read_dataframe(elem)
+    if df.index.dtype in (np.float64, np.int64):
+        df.index = df.index.astype(str)
+    return df
 class _AnnDataAttrsMixin:
     storage: StorageType
     _attrs_keys: Mapping[str, list]
     @cached_property
-    def obs(self) -> pd.DataFrame:
+    def obs(self) -> pd.DataFrame | None:
         if "obs" not in self._attrs_keys:
             return None
         indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], slice(None))
-            obj = registry.safer_read_partial(self.storage["obs"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["obs"])  # type: ignore
+        return _safer_read_df(
+            self.storage["obs"],  # type: ignore
+            indices=(indices[0], slice(None)) if indices is not None else None,
+        )
     @cached_property
-    def var(self) -> pd.DataFrame:
+    def var(self) -> pd.DataFrame | None:
         if "var" not in self._attrs_keys:
             return None
         indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], slice(None))
-            obj = registry.safer_read_partial(self.storage["var"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["var"])  # type: ignore
+        return _safer_read_df(
+            self.storage["var"],  # type: ignore
+            indices=(indices[1], slice(None)) if indices is not None else None,
+        )
     @cached_property
     def uns(self):
@@ -702,6 +733,7 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         connection: OpenFile | None,
         storage: StorageType,
         filename: str,
+        artifact: Artifact | None = None,
     ):
         self._conn = connection
         self.storage = storage
@@ -713,14 +745,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         self._obs_names = _safer_read_index(self.storage["obs"])  # type: ignore
         self._var_names = _safer_read_index(self.storage["var"])  # type: ignore
+        self._artifact = artifact  # save artifact to update in write mode
+        self._updated = False  # track updates in r+ mode for zarr
+        self._entered = False  # check that the context manager is used
         self._closed = False
     def close(self):
         """Closes the connection."""
-        if hasattr(self, "storage") and hasattr(self.storage, "close"):
-            self.storage.close()
-        if hasattr(self, "_conn") and hasattr(self._conn, "close"):
-            self._conn.close()
+        storage = self.storage
+        connection = self._conn
+        if self._updated and (artifact := self._artifact) is not None:
+            from lamindb.models.artifact import Artifact
+            from lamindb.models.sqlrecord import init_self_from_db
+            # now self._updated can only be True for zarr
+            assert ZARR_INSTALLED  # noqa: S101
+            store = storage.store
+            keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys()
+            # this checks that there consolidated metadata was written before
+            # need to update it
+            # zmetadata is in spatialdata sometimes for some reason
+            if ".zmetadata" in keys or "zmetadata" in keys:
+                zarr.consolidate_metadata(store)
+            new_version = Artifact(
+                artifact.path, revises=artifact, _is_internal_call=True
+            ).save()
+            # note: sets _state.db = "default"
+            init_self_from_db(artifact, new_version)
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(connection, "close"):
+            connection.close()
         self._closed = True
     @property
@@ -728,6 +789,8 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return self._closed
     def __enter__(self):
+        self._entered = True
         return self
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -763,6 +826,35 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+    def add_column(
+        self,
+        where: Literal["obs", "var"],
+        col_name: str,
+        col: np.ndarray | pd.Categorical,
+    ):
+        """Add a new column to .obs or .var of the underlying AnnData object."""
+        df_store = self.storage[where]  # type: ignore
+        if getattr(df_store, "read_only", True):
+            raise ValueError(
+                "You can use .add_column(...) only with zarr in a writable mode."
+            )
+        write_elem(df_store, col_name, col)
+        df_store.attrs["column-order"] = df_store.attrs["column-order"] + [col_name]
+        # remind only once if this wasn't updated before and not in the context manager
+        if not self._updated and not self._entered and self._artifact is not None:
+            logger.important(
+                "Do not forget to call .close() after you finish "
+                f"working with this accessor for {self._name} "
+                "to automatically update the corresponding artifact."
+            )
+        self._updated = True
+        # reset the cached property
+        # todo: maybe just append the column if the df was already loaded
+        self.__dict__.pop(where, None)
+        # update the cached columns
+        self._attrs_keys[where].append(col_name)
 # get the number of observations in an anndata object or file fast and safely
 def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:

lamindb/core/storage/_backed_access.py CHANGED Viewed

@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
+import h5py
 from anndata._io.specs.registry import get_spec
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
@@ -92,10 +93,10 @@ def backed_access(
     from lamindb.models import Artifact
     if isinstance(artifact_or_filepath, Artifact):
-        objectpath, _ = filepath_from_artifact(
-            artifact_or_filepath, using_key=using_key
-        )
+        artifact = artifact_or_filepath
+        objectpath, _ = filepath_from_artifact(artifact, using_key=using_key)
     else:
+        artifact = None
         objectpath = artifact_or_filepath
     name = objectpath.name
     # ignore .gz, only check the real suffix
@@ -111,9 +112,11 @@ def backed_access(
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
         conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
+        if mode not in {"r", "r+"}:
+            raise ValueError("`mode` should be either 'r' or 'r+' for zarr.")
         conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
         if "spatialdata_attrs" in storage.attrs:
-            return SpatialDataAccessor(storage, name)
+            return SpatialDataAccessor(storage, name, artifact)
     elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
         df_suffix := df_suffixes.pop()
     ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
@@ -127,9 +130,9 @@ def backed_access(
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
     if is_anndata:
-        if mode != "r":
-            raise ValueError("Can only access `AnnData` with mode='r'.")
-        return AnnDataAccessor(conn, storage, name)
+        if mode != "r" and isinstance(storage, h5py.Group):
+            raise ValueError("Can only access `hdf5` `AnnData` with mode='r'.")
+        return AnnDataAccessor(conn, storage, name, artifact)
     else:
         return BackedAccessor(conn, storage)

lamindb/core/storage/_spatialdata_accessor.py CHANGED Viewed

@@ -8,13 +8,22 @@ from ._anndata_accessor import AnnDataAccessor
 if TYPE_CHECKING:
     from zarr import Group
+    from lamindb import Artifact
 class _TablesAccessor:
-    def __init__(self, tables: Group):
+    def __init__(self, tables: Group, artifact: Artifact | None = None):
         self._tables = tables
+        self._artifact = artifact
     def __getitem__(self, key: str) -> AnnDataAccessor:
-        return AnnDataAccessor(connection=None, storage=self._tables[key], filename=key)
+        return AnnDataAccessor(
+            connection=None,
+            storage=self._tables[key],
+            filename=key,
+            artifact=self._artifact,
+        )
     def keys(self) -> list[str]:
         return list(self._tables.keys())
@@ -33,14 +42,16 @@ class SpatialDataAccessor:
     For now only allows to access `tables`.
     """
-    def __init__(self, storage: Group, name: str):
+    def __init__(self, storage: Group, name: str, artifact: Artifact | None = None):
         self.storage = storage
         self._name = name
+        self._artifact = artifact
     @cached_property
     def tables(self) -> _TablesAccessor:
         """tables of the underlying SpatialData object."""
-        return _TablesAccessor(self.storage["tables"])
+        return _TablesAccessor(self.storage["tables"], self._artifact)
     def __repr__(self):
         """Description of the SpatialDataAccessor object."""

lamindb/core/storage/_zarr.py CHANGED Viewed

@@ -37,6 +37,9 @@ def get_zarr_store(
     if isinstance(storepath, LocalPathClasses):
         store = storepath_str
     elif IS_ZARR_V3:
+        # todo: also check how to treat non-asynchronous filesystems
+        # zarr has something for this, using fsspec async wrapper
+        # check FsspecStore code
         store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True))
     else:
         store = create_mapper(storepath.fs, storepath_str, check=check, create=create)

lamindb/curators/_legacy.py CHANGED Viewed

@@ -133,7 +133,7 @@ class CatManager:
         if self._artifact is None:
             if isinstance(self._dataset, pd.DataFrame):
-                artifact = Artifact.from_df(
+                artifact = Artifact.from_dataframe(
                     self._dataset,
                     key=key,
                     description=description,
@@ -1275,7 +1275,7 @@ class TiledbsomaCatManager(CatManager):
                 empty_dict, schema=self._obs_pa_schema
             ).to_pandas()
             # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
-            feature_sets["obs"] = Schema.from_df(
+            feature_sets["obs"] = Schema.from_dataframe(
                 df=mock_df,
                 field=self._columns_field,
                 mute=True,
@@ -1367,7 +1367,7 @@ def legacy_annotate_artifact(
 @classmethod  # type: ignore
-def from_df(
+def from_dataframe(
     cls,
     df: pd.DataFrame,
     categoricals: dict[str, FieldAttr] | None = None,
@@ -1383,6 +1383,18 @@ def from_df(
     )
+@classmethod  # type: ignore
+@deprecated("from_dataframe")
+def from_df(
+    cls,
+    df: pd.DataFrame,
+    categoricals: dict[str, FieldAttr] | None = None,
+    columns: FieldAttr = Feature.name,
+    organism: str | None = None,
+) -> DataFrameCatManager:
+    return cls.from_dataframe(df, categoricals, columns, organism)
 @classmethod  # type: ignore
 def from_anndata(
     cls,
@@ -1468,6 +1480,7 @@ def from_spatialdata(
     )
+CatManager.from_dataframe = from_dataframe  # type: ignore
 CatManager.from_df = from_df  # type: ignore
 CatManager.from_anndata = from_anndata  # type: ignore
 CatManager.from_mudata = from_mudata  # type: ignore

lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl