PyPI - lamindb - Versions diffs - 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

lamindb/__init__.py +3 -1
lamindb/_view.py +2 -2
lamindb/base/types.py +50 -11
lamindb/core/_compat.py +60 -0
lamindb/core/_context.py +15 -12
lamindb/core/datasets/__init__.py +1 -0
lamindb/core/datasets/_core.py +23 -0
lamindb/core/datasets/_small.py +16 -2
lamindb/core/loaders.py +22 -12
lamindb/core/storage/_tiledbsoma.py +2 -2
lamindb/core/storage/_zarr.py +84 -26
lamindb/core/storage/objects.py +45 -44
lamindb/core/types.py +11 -1
lamindb/curators/__init__.py +1430 -1665
lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
lamindb/models/_feature_manager.py +86 -42
lamindb/models/_from_values.py +110 -119
lamindb/models/_label_manager.py +17 -10
lamindb/models/artifact.py +170 -102
lamindb/models/can_curate.py +200 -231
lamindb/models/feature.py +76 -47
lamindb/models/project.py +69 -7
lamindb/models/query_set.py +12 -2
lamindb/models/record.py +77 -50
lamindb/models/run.py +20 -7
lamindb/models/schema.py +7 -15
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0

lamindb/__init__.py CHANGED Viewed

@@ -32,6 +32,7 @@ Registries.
 Curators & integrations.
 .. autosummary::
+   :toctree: .
    curators
    integrations
@@ -71,7 +72,7 @@ Backward compatibility.
 # ruff: noqa: I001
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.2a2"
+__version__ = "1.3.1"
 import warnings
@@ -120,6 +121,7 @@ if _check_instance_setup(from_module="lamindb"):
     from .models.save import save
     from . import core
     from . import integrations
+    from . import curators
     track = context.track  # simple access
     finish = context.finish  # simple access

lamindb/_view.py CHANGED Viewed

@@ -11,7 +11,7 @@ from lamindb_setup._init_instance import get_schema_module_name
 from lamindb.models import Feature, FeatureValue, ParamValue, Record
-from .models.feature import convert_pandas_dtype_to_lamin_dtype
+from .models.feature import serialize_pandas_dtype
 if TYPE_CHECKING:
     import pandas as pd
@@ -114,7 +114,7 @@ def view(
     """
     if df is not None:
         descriptions = {
-            col_name: convert_pandas_dtype_to_lamin_dtype(dtype)
+            col_name: serialize_pandas_dtype(dtype)
             for col_name, dtype in df.dtypes.to_dict().items()
         }
         feature_dtypes = dict(Feature.objects.values_list("name", "dtype"))

lamindb/base/types.py CHANGED Viewed

@@ -7,7 +7,7 @@ Central object types.
    ArtifactKind
    TransformType
-   FeatureDtype
+   Dtype
 Basic types.
@@ -38,14 +38,53 @@ TransformType = Literal[
     "pipeline", "notebook", "upload", "script", "function", "linker"
 ]
 ArtifactKind = Literal["dataset", "model"]
-FeatureDtype = Literal[
-    "cat",  # categorical variables
-    "num",  # numerical variables
-    "str",  # string variables
-    "int",  # integer variables
-    "float",  # float variables
-    "bool",  # boolean variables
-    "date",  # date variables
-    "datetime",  # datetime variables
-    "object",  # this is a pandas type, we're only using it for complicated types, not for strings
+# below is used for Feature.dtype and Param.dtype
+Dtype = Literal[
+    "cat",  # categoricals
+    "num",  # numericals
+    "str",  # string
+    "int",  # integer / numpy.integer
+    "float",  # float
+    "bool",  # boolean
+    "date",  # date
+    "datetime",  # datetime
+    "object",  # this is a pandas input dtype, we're only using it for complicated types, not for strings
 ]
+"""Data type.
+Data types in lamindb are a string-serialized abstraction of common data types.
+Overview
+========
+============  ============  =================================================
+description   lamindb       pandas
+============  ============  =================================================
+categorical   `"cat"`       `category`
+numerical     `"num"`       `int | float`
+integer       `"int"`       `int64 | int32 | int16 | int8 | uint | ...`
+float         `"float"`     `float64 | float32 | float16 | float8 | ...`
+string        `"str"`       `object`
+datetime      `"datetime"`  `datetime`
+date          `"date"`      `date`
+============  ============  =================================================
+Categoricals
+============
+Beyond indicating that a feature is a categorical, `lamindb` allows you to define the registry to which values are restricted.
+For example, `'cat[ULabel]'` or `'cat[bionty.CellType]'` indicate that permissible values are from the `ULabel` or `CellType` registry, respectively.
+You can also reference multiple registries, e.g., `'cat[ULabel|bionty.CellType]'` indicates that values can be from either registry.
+You can also restrict to sub-types defined in registries via the `type` column, e.g., `'cat[ULabel[CellMedium]]'` indicates that values must be of type `CellMedium` within the `ULabel` registry.
+Literal
+=======
+A `Dtype` object in `lamindb` is a `Literal` up to further specification of `"cat"`.
+"""
+FeatureDtype = Dtype  # backward compat

lamindb/core/_compat.py ADDED Viewed

@@ -0,0 +1,60 @@
+import importlib.util
+from typing import Any, Callable, TypeVar
+T = TypeVar("T")
+def is_package_installed(package_name: str) -> bool:
+    spec = importlib.util.find_spec(package_name)
+    return spec is not None
+def with_package(package_name: str, operation: Callable[[Any], T]) -> T:
+    """Execute an operation that requires a specific package.
+    Args:
+        package_name: Package name (e.g., "mudata")
+        operation: Function that takes the imported module and returns a result
+    Examples:
+        # For direct package functions
+        result = with_package("mudata", lambda mod: mod.read_zarr(path))
+    """
+    try:
+        module = importlib.import_module(package_name)
+        return operation(module)
+    except ImportError:
+        raise ImportError(
+            f"Package '{package_name}' is required but not installed. "
+            f"Please install with: pip install {package_name}"
+        ) from None
+def with_package_obj(
+    obj: Any, class_name: str, package_name: str, operation: Callable[[Any], T]
+) -> tuple[bool, T | None]:
+    """Handle operations on objects that require specific packages.
+    Args:
+        obj: The object to operate on
+        class_name: Expected class name (e.g., "MuData")
+        package_name: Package that provides the class (e.g., "mudata")
+        operation: Function to call with the object if package is available.
+    Examples:
+        # For instance methods
+        handled, res = apply_class_func(dmem, "MuData", "mudata",
+                                      lambda obj: obj.write(filepath))
+    """
+    if obj.__class__.__name__ == class_name:
+        try:
+            importlib.import_module(package_name)
+            result = operation(obj)
+            return True, result
+        except ImportError:
+            raise ImportError(
+                f"Object appears to be {class_name} but '{package_name}' package is not installed. "
+                f"Please install with: pip install {package_name}"
+            ) from None
+    return False, None

lamindb/core/_context.py CHANGED Viewed

@@ -301,6 +301,12 @@ class Context:
         """
         from lamindb.models import Project
+        instance_settings = ln_setup.settings.instance
+        # similar logic here: https://github.com/laminlabs/lamindb/pull/2527
+        # TODO: refactor upon new access management
+        if instance_settings.dialect == "postgresql" and "read" in instance_settings.db:
+            logger.warning("skipping track(), connected in read-only mode")
+            return None
         if project is not None:
             project_record = Project.filter(
                 Q(name=project) | Q(uid=project)
@@ -461,26 +467,23 @@ class Context:
             path_str = get_notebook_key_colab()
             path = Path(path_str)
         else:
-            import nbproject
+            from nbproject.dev import read_notebook
+            from nbproject.dev._meta_live import get_title
+            from nbproject.dev._pypackage import infer_pypackages
             try:
-                nbproject_title = nbproject.meta.live.title
-            except IndexError:
-                # notebook is not saved
-                pass
-            if nbproject_title is not None:
-                description = nbproject_title
-            # log imported python packages
-            try:
-                from nbproject.dev._pypackage import infer_pypackages
+                nb = read_notebook(path_str)
+                nbproject_title = get_title(nb)
+                if nbproject_title is not None:
+                    description = nbproject_title
-                nb = nbproject.dev.read_notebook(path_str)
                 self._logging_message_imports += (
                     "notebook imports:"
                     f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
                 )
             except Exception:
-                logger.debug("inferring imported packages failed")
+                logger.debug("reading the notebook file failed")
                 pass
         return path, description

lamindb/core/datasets/__init__.py CHANGED Viewed

@@ -83,6 +83,7 @@ from ._core import (
     mudata_papalexi21_subset,
     schmidt22_crispra_gws_IFNG,
     schmidt22_perturbseq,
+    spatialdata_blobs,
 )
 from ._fake import fake_bio_notebook_titles
 from ._small import (

lamindb/core/datasets/_core.py CHANGED Viewed

@@ -13,6 +13,7 @@ from lamindb.core._settings import settings
 if TYPE_CHECKING:
     from mudata import MuData
+    from spatialdata import SpatialData
 def file_fcs() -> Path:
@@ -552,3 +553,25 @@ def schmidt22_perturbseq(basedir=".") -> Path:  # pragma: no cover
         "schmidt22_perturbseq.h5ad",
     )
     return Path(filepath).rename(Path(basedir) / filepath)
+def spatialdata_blobs() -> SpatialData:
+    """Example SpatialData dataset for tutorials."""
+    from spatialdata.datasets import blobs
+    sdata = blobs()
+    sdata.attrs["sample"] = {
+        "assay": "Visium Spatial Gene Expression",
+        "disease": "Alzheimer disease",
+        "developmental_stage": "adult stage",
+    }
+    sdata.tables["table"].var.index = [
+        "ENSG00000139618",  # BRCA2
+        "ENSG00000157764",  # BRAF
+        "ENSG00000999999",  # Does not exist
+    ]
+    sdata.tables["table"].obs["sample_region"] = pd.Categorical(
+        ["sample region 1"] * 13 + ["sample region 2"] * 13
+    )
+    return sdata

lamindb/core/datasets/_small.py CHANGED Viewed

@@ -8,9 +8,11 @@ import pandas as pd
 def small_dataset1(
-    otype: Literal["DataFrame", "AnnData"],
+    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
     gene_symbols_in_index: bool = False,
     with_typo: bool = False,
+    with_cell_type_synonym: bool = False,
+    with_cell_type_typo: bool = False,
 ) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
@@ -19,14 +21,25 @@ def small_dataset1(
         var_ids = ["CD8A", "CD4", "CD14"]
     else:
         var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
+    abt_cell = (
+        "CD8-pos alpha-beta T cell"
+        if with_cell_type_typo
+        else "CD8-positive, alpha-beta T cell"
+    )
     dataset_dict = {
         var_ids[0]: [1, 2, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [5, 6, 7],
         "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_expert": pd.Categorical(
+            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+        ),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0001", "D0002", None],
     }
     # define the dataset-level metadata
     metadata = {
@@ -100,6 +113,7 @@ def small_dataset3_cellxgene(
         "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
         "organism": ["human", "human", "human"],
         "sex": ["female", "male", "unknown"],
+        "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
         "tissue": ["lungg", "lungg", "heart"],
         "donor": ["-1", "1", "2"],
     }

lamindb/core/loaders.py CHANGED Viewed

@@ -20,10 +20,10 @@ from __future__ import annotations
 import builtins
 import re
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
-import anndata as ad
 import pandas as pd
+from anndata import read_h5ad
 from lamin_utils import logger
 from lamindb_setup.core.upath import (
     create_path,
@@ -33,13 +33,17 @@ from lamindb_setup.core.upath import (
 from ..core._settings import settings
 if TYPE_CHECKING:
+    from anndata import AnnData
     from lamindb_setup.core.types import UPathStr
+    from mudata import MuData
+    from lamindb.core.types import ScverseDataStructures
 try:
-    from ..core.storage._zarr import load_anndata_zarr
+    from ..core.storage._zarr import load_zarr
 except ImportError:
-    def load_anndata_zarr(storepath):  # type: ignore
+    def load_zarr(storepath):  # type: ignore
         raise ImportError("Please install zarr: pip install zarr<=2.18.4")
@@ -47,7 +51,7 @@ is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
 # tested in lamin-usecases
-def load_fcs(*args, **kwargs) -> ad.AnnData:
+def load_fcs(*args, **kwargs) -> AnnData:
     """Load an `.fcs` file to `AnnData`."""
     try:
         import readfcs
@@ -62,16 +66,16 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
     return pd.read_csv(path_sanitized, sep="\t", **kwargs)
-def load_h5ad(filepath, **kwargs) -> ad.AnnData:
+def load_h5ad(filepath, **kwargs) -> AnnData:
     """Load an `.h5ad` file to `AnnData`."""
     fs, filepath = infer_filesystem(filepath)
     compression = kwargs.pop("compression", "infer")
     with fs.open(filepath, mode="rb", compression=compression) as file:
-        adata = ad.read_h5ad(file, backed=False, **kwargs)
+        adata = read_h5ad(file, backed=False, **kwargs)
         return adata
-def load_h5mu(filepath: UPathStr, **kwargs):
+def load_h5mu(filepath: UPathStr, **kwargs) -> MuData:
     """Load an `.h5mu` file to `MuData`."""
     import mudata as md
@@ -100,7 +104,7 @@ def load_html(path: UPathStr) -> None | UPathStr:
         return path
-def load_json(path: UPathStr) -> dict:
+def load_json(path: UPathStr) -> dict[str, Any] | list[Any]:
     """Load `.json` to `dict`."""
     import json
@@ -109,7 +113,7 @@ def load_json(path: UPathStr) -> dict:
     return data
-def load_yaml(path: UPathStr) -> dict:
+def load_yaml(path: UPathStr) -> dict[str, Any] | list[Any]:
     """Load `.yaml` to `dict`."""
     import yaml  # type: ignore
@@ -156,7 +160,7 @@ FILE_LOADERS = {
     ".parquet": pd.read_parquet,
     ".parquet.gz": pd.read_parquet,  # this doesn't work for externally gzipped files, REMOVE LATER
     ".fcs": load_fcs,
-    ".zarr": load_anndata_zarr,
+    ".zarr": load_zarr,
     ".html": load_html,
     ".json": load_json,
     ".yaml": load_yaml,
@@ -172,10 +176,15 @@ SUPPORTED_SUFFIXES = [sfx for sfx in FILE_LOADERS.keys() if sfx != ".rds"]
 """Suffixes with defined artifact loaders."""
-def load_to_memory(filepath: UPathStr, **kwargs):
+def load_to_memory(
+    filepath: UPathStr, **kwargs
+) -> (
+    pd.DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | UPathStr | None
+):
     """Load a file into memory.
     Returns the filepath if no in-memory form is found.
+    May return None in interactive sessions for images.
     """
     filepath = create_path(filepath)
@@ -194,4 +203,5 @@ def load_to_memory(filepath: UPathStr, **kwargs):
         )
     filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
     return loader(filepath, **kwargs)

lamindb/core/storage/_tiledbsoma.py CHANGED Viewed

@@ -24,10 +24,10 @@ if TYPE_CHECKING:
 def _load_h5ad_zarr(objpath: UPath):
-    from lamindb.core.loaders import load_anndata_zarr, load_h5ad
+    from lamindb.core.loaders import load_h5ad, load_zarr
     if objpath.is_dir():
-        adata = load_anndata_zarr(objpath)
+        adata = load_zarr(objpath, expected_type="anndata")
     else:
         # read only local in backed for now
         # in principle possible to read remote in backed also

lamindb/core/storage/_zarr.py CHANGED Viewed

@@ -9,25 +9,60 @@ from anndata import __version__ as anndata_version
 from anndata._io.specs import write_elem
 from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
-from lamindb_setup.core.upath import create_mapper, infer_filesystem
+from lamindb_setup.core.upath import S3FSMap, create_mapper, infer_filesystem
 from packaging import version
+from lamindb.core._compat import with_package
 from ._anndata_sizes import _size_elem, _size_raw, size_adata
 if version.parse(anndata_version) < version.parse("0.11.0"):
-    from anndata._io import read_zarr
+    from anndata._io import read_zarr as read_anndata_zarr
 else:
-    from anndata.io import read_zarr
+    from anndata.io import read_zarr as read_anndata_zarr
 if TYPE_CHECKING:
     from anndata import AnnData
+    from fsspec import FSMap
     from lamindb_setup.core.types import UPathStr
+    from lamindb.core.types import ScverseDataStructures
+def create_zarr_open_obj(
+    storepath: UPathStr, *, check: bool = True
+) -> str | S3FSMap | FSMap:
+    """Creates the correct object that can be used to open a zarr file depending on local or remote location."""
+    fs, storepath_str = infer_filesystem(storepath)
+    if isinstance(fs, LocalFileSystem):
+        open_obj = storepath_str
+    else:
+        open_obj = create_mapper(fs, storepath_str, check=check)
+    return open_obj
+def _identify_zarr_type_from_storage(
+    storage: zarr.Group,
+) -> Literal["anndata", "mudata", "spatialdata", "unknown"]:
+    """Internal helper to identify zarr type from an open storage object."""
+    try:
+        if storage.attrs.get("encoding-type", "") == "anndata":
+            return "anndata"
+        elif storage.attrs.get("encoding-type", "") == "MuData":
+            return "mudata"
+        elif "spatialdata_attrs" in storage.attrs:
+            return "spatialdata"
+    except Exception as error:
+        logger.warning(f"an exception occurred {error}")
+    return "unknown"
 def identify_zarr_type(
     storepath: UPathStr, *, check: bool = True
-) -> Literal["anndata", "spatialdata", "unknown"]:
+) -> Literal["anndata", "mudata", "spatialdata", "unknown"]:
     """Identify whether a zarr store is AnnData, SpatialData, or unknown type."""
     # we can add these cheap suffix-based-checks later
     # also need to check whether the .spatialdata.zarr suffix
@@ -39,38 +74,61 @@ def identify_zarr_type(
     # elif ".anndata" in suffixes:
     #     return "anndata"
-    fs, storepath_str = infer_filesystem(storepath)
-    if isinstance(fs, LocalFileSystem):
-        open_obj = storepath_str
-    else:
-        open_obj = create_mapper(fs, storepath_str, check=check)
+    open_obj = create_zarr_open_obj(storepath, check=check)
     try:
         storage = zarr.open(open_obj, mode="r")
-        if "spatialdata_attrs" in storage.attrs:
-            return "spatialdata"
-        if storage.attrs.get("encoding-type", "") == "anndata":
-            return "anndata"
+        return _identify_zarr_type_from_storage(storage)
     except Exception as error:
-        logger.warning(f"an exception occured {error}")
+        logger.warning(
+            f"an exception occured while trying to open the zarr store\n {error}"
+        )
     return "unknown"
-def load_anndata_zarr(storepath: UPathStr) -> AnnData:
-    fs, storepath_str = infer_filesystem(storepath)
-    if isinstance(fs, LocalFileSystem):
-        # this is faster than through an fsspec mapper for local
-        open_obj = storepath_str
-    else:
-        open_obj = create_mapper(fs, storepath_str, check=True)
-    adata = read_zarr(open_obj)
-    return adata
+def load_zarr(
+    storepath: UPathStr,
+    expected_type: Literal["anndata", "mudata", "spatialdata"] = None,
+) -> ScverseDataStructures:
+    """Loads a zarr store and returns the corresponding scverse data structure.
+    Args:
+        storepath: Path to the zarr store
+        expected_type: If provided, ensures the zarr store is of this type ("anndata", "mudata", "spatialdata")
+                       and raises ValueError if it's not
+    """
+    open_obj = create_zarr_open_obj(storepath, check=True)
+    # Open the storage once
+    try:
+        storage = zarr.open(open_obj, mode="r")
+    except Exception as error:
+        raise ValueError(f"Could not open zarr store: {error}") from None
+    actual_type = _identify_zarr_type_from_storage(storage)
+    if expected_type is not None and actual_type != expected_type:
+        raise ValueError(
+            f"Expected zarr store of type '{expected_type}', but found '{actual_type}'"
+        )
+    match actual_type:
+        case "anndata":
+            scverse_obj = read_anndata_zarr(open_obj)
+        case "mudata":
+            scverse_obj = with_package("mudata", lambda mod: mod.read_zarr(open_obj))
+        case "spatialdata":
+            scverse_obj = with_package(
+                "spatialdata", lambda mod: mod.read_zarr(open_obj)
+            )
+        case "unknown" | _:
+            raise ValueError(
+                "Unable to determine zarr store format and therefore cannot load Artifact."
+            )
+    return scverse_obj
 def write_adata_zarr(
     adata: AnnData, storepath: UPathStr, callback=None, chunks=None, **dataset_kwargs
-):
+) -> None:
     fs, storepath_str = infer_filesystem(storepath)
     store = create_mapper(fs, storepath_str, create=True)

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl