lamindb 1.7.1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -108,7 +108,7 @@ Backwards compatibility.
 
 # ruff: noqa: I001
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.7.1"
+__version__ = "1.9.0"
 
 import warnings
 
lamindb/_finish.py CHANGED
@@ -260,9 +260,9 @@ def save_context_core(
     is_r_notebook = filepath.suffix in {".qmd", ".Rmd"}
     source_code_path = filepath
     report_path: Path | None = None
-    save_source_code_and_report = True
+    save_source_code_and_report = filepath.exists()
     if (
-        is_run_from_ipython and notebook_runner != "nbconvert"
+        is_run_from_ipython and notebook_runner != "nbconvert" and filepath.exists()
     ):  # python notebooks in interactive session
         import nbproject
 
@@ -281,7 +281,7 @@ def save_context_core(
             logger.warning(
                 "the notebook on disk wasn't saved within the last 10 sec"
             )
-    if is_ipynb:  # could be from CLI outside interactive session
+    if is_ipynb and filepath.exists():  # could be from CLI outside interactive session
         try:
             import jupytext  # noqa: F401
             from nbproject.dev import (
@@ -315,6 +315,8 @@ def save_context_core(
                 ".ipynb", ".py"
             )
             notebook_to_script(transform.description, filepath, source_code_path)
+    elif is_ipynb and not filepath.exists():
+        logger.warning("notebook file does not exist in compute environment")
     elif is_r_notebook:
         if filepath.with_suffix(".nb.html").exists():
             report_path = filepath.with_suffix(".nb.html")
@@ -365,6 +367,9 @@ def save_context_core(
     base_path = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}"
     paths = [base_path / "run_env_pip.txt", base_path / "r_pak_lockfile.json"]
     existing_paths = [path for path in paths if path.exists()]
+    if len(existing_paths) == 2:
+        # let's not store the python environment for an R session for now
+        existing_paths = [base_path / "r_pak_lockfile.json"]
 
     if existing_paths:
         overwrite_env = True
lamindb/core/_context.py CHANGED
@@ -17,20 +17,18 @@ from lamin_utils import logger
 from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import hash_file
 
-from lamindb.base import ids
-from lamindb.base.ids import base62_12
-from lamindb.models import Run, Transform, format_field_value
-
-from ..core._settings import settings
+from ..base.ids import base62_12
 from ..errors import (
     InvalidArgument,
     TrackNotCalled,
     UpdateContext,
 )
+from ..models import Run, Transform, format_field_value
 from ..models._is_versioned import bump_version as bump_version_function
 from ..models._is_versioned import (
     increment_base62,
 )
+from ._settings import is_read_only_connection, settings
 from ._sync_git import get_transform_reference_from_git_repo
 from ._track_environment import track_python_environment
 
@@ -324,6 +322,7 @@ class Context:
         params: dict | None = None,
         new_run: bool | None = None,
         path: str | None = None,
+        pypackages: bool | None = None,
     ) -> None:
         """Track a run of your notebook or script.
 
@@ -343,6 +342,7 @@
             (default notebook), if `True`, creates new run (default non-notebook).
         path: Filepath of notebook or script. Only needed if it can't be
             automatically detected.
+        pypackages: If `True` or `None`, infers Python packages used in a notebook.
 
         Examples:
 
@@ -365,10 +365,8 @@
             save_context_core,
         )
 
-        instance_settings = ln_setup.settings.instance
         # similar logic here: https://github.com/laminlabs/lamindb/pull/2527
-        # TODO: refactor upon new access management
-        if instance_settings.dialect == "postgresql" and "read" in instance_settings.db:
+        if is_read_only_connection():
             logger.warning("skipping track(), connected in read-only mode")
             return None
         if project is None:
@@ -428,7 +426,9 @@
         if transform is None:
             description = None
             if is_run_from_ipython:
-                self._path, description = self._track_notebook(path_str=path)
+                self._path, description = self._track_notebook(
+                    path_str=path, pypackages=pypackages
+                )
                 transform_type = "notebook"
                 transform_ref = None
                 transform_ref_type = None
@@ -591,11 +591,14 @@
         self,
         *,
         path_str: str | None,
+        pypackages: bool | None = None,
     ) -> tuple[Path, str | None]:
         if path_str is None:
             path, self._notebook_runner = get_notebook_path()
         else:
             path = Path(path_str)
+        if pypackages is None:
+            pypackages = True
         description = None
         path_str = path.as_posix()
         if path_str.endswith("Untitled.ipynb"):
@@ -616,10 +619,11 @@
                 if nbproject_title is not None:
                     description = nbproject_title
 
-                self._logging_message_imports += (
-                    "notebook imports:"
-                    f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
-                )
+                if pypackages:
+                    self._logging_message_imports += (
+                        "notebook imports:"
+                        f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
+                    )
         except Exception:
             logger.debug("reading the notebook file failed")
             pass
@@ -689,10 +693,21 @@
             source_code_path = ln_setup.settings.cache_dir / self._path.name.replace(
                 ".ipynb", ".py"
             )
-            notebook_to_script(description, self._path, source_code_path)
-            transform_hash, _ = hash_file(source_code_path)
+            if (
+                self._path.exists()
+            ):  # notebook kernel might be running on a different machine
+                notebook_to_script(description, self._path, source_code_path)
+                transform_hash, _ = hash_file(source_code_path)
+            else:
+                logger.debug(
+                    "skipping notebook hash comparison, notebook kernel running on a different machine"
+                )
+                transform_hash = None
         # see whether we find a transform with the exact same hash
-        aux_transform = Transform.filter(hash=transform_hash).one_or_none()
+        if transform_hash is not None:
+            aux_transform = Transform.filter(hash=transform_hash).one_or_none()
+        else:
+            aux_transform = None
         # if the user did not pass a uid and there is no matching aux_transform
         # need to search for the transform based on the filename
         if self.uid is None and aux_transform is None:
@@ -856,7 +871,7 @@
             and not transform_was_saved
         ):
             raise UpdateContext(
-                f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{ids.base62_12()}0000")`.'
+                f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{base62_12()}0000")`.'
             )
         # check whether transform source code was already saved
         if transform_was_saved:
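The new `pypackages` flag is threaded from `track()` into `_track_notebook()`, where `None` is coerced to `True`, so package inference stays on by default for notebooks. A minimal usage sketch against the new signature (instance setup omitted):

```python
import lamindb as ln

# default behavior: notebook imports are inferred and logged
ln.track()

# opt out of scanning the notebook for imported packages
ln.track(pypackages=False)
```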
lamindb/core/_settings.py CHANGED
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 import os
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
 from lamin_utils import colors, logger
+from lamindb_setup import settings as setup_settings
 from lamindb_setup._set_managed_storage import set_managed_storage
-from lamindb_setup.core._settings import settings as setup_settings
+from lamindb_setup.core import deprecated
 from lamindb_setup.core._settings_instance import sanitize_git_repo_url
 
 from .subsettings._annotation_settings import AnnotationSettings, annotation_settings
@@ -19,6 +21,15 @@ if TYPE_CHECKING:
     from lamindb_setup.core._settings_storage import StorageSettings
     from upath import UPath
 
+
+def is_read_only_connection() -> bool:
+    instance = setup_settings.instance
+    if instance.dialect == "postgresql":
+        db_url = instance.db
+        return "read" in db_url or "public" in db_url
+    return False
+
+
 VERBOSITY_TO_INT = {
     "error": 0,  # 40
     "warning": 1,  # 30
@@ -44,6 +55,9 @@ class Settings:
         self._sync_git_repo: str | None = None
 
     def __repr__(self) -> str:  # pragma: no cover
+        if "sphinx" in sys.modules:
+            return object.__repr__(self)
+
         cls_name = colors.green(self.__class__.__name__)
         verbosity_color = colors.yellow if self.verbosity == "warning" else colors.green
         verbosity_str = verbosity_color(self.verbosity)
@@ -181,6 +195,8 @@
     def storage(self, path_kwargs: str | Path | UPath | tuple[str | UPath, Mapping]):
         if isinstance(path_kwargs, tuple):
             path, kwargs = path_kwargs
+            if isinstance(kwargs, str):
+                kwargs = {"host": kwargs}
         else:
             path, kwargs = path_kwargs, {}
         set_managed_storage(path, **kwargs)
@@ -196,18 +212,28 @@
         return ln_setup.settings.cache_dir
 
     @property
-    def storage_local(self) -> StorageSettings:
+    def local_storage(self) -> StorageSettings:
         """An additional local default storage (a path to its root).
 
         Is only available if :attr:`~lamindb.setup.core.InstanceSettings.keep_artifacts_local` is enabled.
 
         Guide: :doc:`faq/keep-artifacts-local`
         """
-        return ln_setup.settings.instance.storage_local
+        return ln_setup.settings.instance.local_storage
+
+    @local_storage.setter
+    def local_storage(self, local_root: Path):
+        ln_setup.settings.instance.local_storage = local_root
+
+    @property
+    @deprecated("local_storage")
+    def storage_local(self) -> StorageSettings:
+        return self.local_storage
 
     @storage_local.setter
-    def storage_local(self, local_root: Path):
-        ln_setup.settings.instance.storage_local = local_root
+    @deprecated("local_storage")
+    def storage_local(self, local_root_host: tuple[Path | str, str]):
+        self.local_storage = local_root_host  # type: ignore
 
     @property
     def verbosity(self) -> str:
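Together these hunks replace the inline Postgres URL check with the shared `is_read_only_connection()` helper and rename `storage_local` to `local_storage`, keeping the old name as a deprecated alias. A sketch of the resulting surface (paths are placeholders; `local_storage` requires `keep_artifacts_local` to be enabled on the instance):

```python
from pathlib import Path

import lamindb as ln
from lamindb.core._settings import is_read_only_connection

# writes such as ln.track() are skipped on read-only Postgres connections
if not is_read_only_connection():
    ln.settings.local_storage = Path("./local_artifacts")

# deprecated spelling still resolves but points users at local_storage
root_settings = ln.settings.storage_local
```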
@@ -9,35 +9,65 @@ import pandas as pd
 
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
+    with_obs_defaults: bool = False,
+    with_obs_typo: bool = False,
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
     # TODO: consider other ids for other organisms
     # "ENSMUSG00002076988"
     var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
-        "organism": ["human", "human", "human"],
-        "sex": ["female", "male", "unknown"],
-        "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
-        "tissue": ["lungg", "lungg", "heart"],
-        "donor": ["-1", "1", "2"],
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
+
+    lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048"
+    obs_df = pd.DataFrame(
+        {
+            "disease_ontology_term_id": [
+                "MONDO:0004975",
+                "MONDO:0004980",
+                "MONDO:0004980",
+            ],
+            "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
+            "organism": ["human", "human", "human"],
+            "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
+            "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
+            "cell_type": ["T cell", "B cell", "B cell"],
+            "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
+            "donor_id": ["-1", "1", "2"],
+            "is_primary_data": [False, False, False],
+            "suspension_type": ["cell", "cell", "cell"],
+            "tissue_type": ["tissue", "tissue", "tissue"],
+        },
         index=["barcode1", "barcode2", "barcode3"],
     )
-    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+
+    var_df = pd.DataFrame(
+        index=var_ids, data={"feature_is_filtered": [False, False, False]}
     )
+
+    X = pd.DataFrame(
+        {
+            var_ids[0]: [2, 3, 3],
+            var_ids[1]: [3, 4, 5],
+            var_ids[2]: [4, 2, 3],
+        },
+        index=["barcode1", "barcode2", "barcode3"],
+        dtype="float32",
+    )
+
+    obs_df["donor_id"] = obs_df["donor_id"].astype("category")
+
     if otype == "DataFrame":
-        return dataset_df
+        return pd.concat([X, obs_df], axis=1)
     else:
-        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
-        return dataset_ad
+        adata = ad.AnnData(X=X, obs=obs_df, var=var_df)
+        adata.uns["title"] = "CELLxGENE example"
+        adata.obsm["X_pca"] = np.array(
+            [[-1.2, 0.8], [0.5, -0.3], [0.7, -0.5]], dtype="float32"
+        )
+        # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304
+        adata.raw = adata.copy()
+        adata.raw.var.drop(columns="feature_is_filtered", inplace=True)
+        if with_obs_defaults:
+            adata.obs["assay"] = "single-cell RNA sequencing"
+        return adata
 
 
 def anndata_with_obs() -> ad.AnnData:
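A quick way to exercise the rewritten example dataset and its new flags; this sketch assumes the function remains exposed under `lamindb.core.datasets` (the file header is missing from this diff) and that the module imports `numpy as np` for the `X_pca` block:

```python
from lamindb.core.datasets import small_dataset3_cellxgene

# AnnData with CELLxGENE-style obs/var, a .raw slot, and a toy X_pca embedding
adata = small_dataset3_cellxgene()

# deliberately malformed tissue id, useful for testing curation error paths
adata_typo = small_dataset3_cellxgene(with_obs_typo=True)
assert "UBERON:0002048XXX" in set(adata_typo.obs["tissue_ontology_term_id"])

# flat DataFrame variant: expression columns concatenated with obs columns
df = small_dataset3_cellxgene(otype="DataFrame")
```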
@@ -163,6 +163,11 @@ def _open_dataframe(
     engine: Literal["pyarrow", "polars"] = "pyarrow",
     **kwargs,
 ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    if engine not in {"pyarrow", "polars"}:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
     df_suffix: str
     if suffix is None:
         df_suffixes = _flat_suffixes(paths)
@@ -175,34 +180,37 @@
     else:
         df_suffix = suffix
 
-    if engine == "pyarrow":
-        if df_suffix not in PYARROW_SUFFIXES:
-            raise ValueError(
-                f"{df_suffix} files are not supported by pyarrow, "
-                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
-            )
-        # this checks that the filesystem is the same for all paths
-        # this is a requirement of pyarrow.dataset.dataset
-        if not isinstance(paths, Path):  # is a list then
-            fs = getattr(paths[0], "fs", None)
-            for path in paths[1:]:
-                # this assumes that the filesystems are cached by fsspec
-                if getattr(path, "fs", None) is not fs:
-                    raise ValueError(
-                        "The collection has artifacts with different filesystems, "
-                        "this is not supported by pyarrow."
-                    )
-        dataframe = _open_pyarrow_dataset(paths, **kwargs)
-    elif engine == "polars":
-        if df_suffix not in POLARS_SUFFIXES:
-            raise ValueError(
-                f"{df_suffix} files are not supported by polars, "
-                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
-            )
-        dataframe = _open_polars_lazy_df(paths, **kwargs)
-    else:
+    if engine == "pyarrow" and df_suffix not in PYARROW_SUFFIXES:
         raise ValueError(
-            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+            f"{df_suffix} files are not supported by pyarrow, "
+            f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+        )
+    elif engine == "polars" and df_suffix not in POLARS_SUFFIXES:
+        raise ValueError(
+            f"{df_suffix} files are not supported by polars, "
+            f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
         )
 
-    return dataframe
+    polars_without_fsspec = engine == "polars" and not kwargs.get("use_fsspec", False)
+    if (engine == "pyarrow" or polars_without_fsspec) and not isinstance(paths, Path):
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        fs = getattr(paths[0], "fs", None)
+        for path in paths[1:]:
+            # this assumes that the filesystems are cached by fsspec
+            if getattr(path, "fs", None) is not fs:
+                engine_msg = (
+                    "polars engine without passing `use_fsspec=True`"
+                    if engine == "polars"
+                    else "pyarrow engine"
+                )
+                raise ValueError(
+                    "The collection has artifacts with different filesystems, "
+                    f"this is not supported for {engine_msg}."
+                )
+
+    return (
+        _open_pyarrow_dataset(paths, **kwargs)
+        if engine == "pyarrow"
+        else _open_polars_lazy_df(paths, **kwargs)
+    )
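After the refactor, unknown engines fail before suffix detection, and the single-filesystem check now also applies to polars unless `use_fsspec=True` is passed. A sketch of the private helper's semantics (the module path is not shown in this diff, so the import is omitted; `paths` stands for `UPath` objects on one filesystem):

```python
# polars: _open_dataframe returns a context manager yielding a LazyFrame
with _open_dataframe(paths, engine="polars") as lf:
    print(lf.head().collect())

# pyarrow: returns a pyarrow dataset directly
dataset = _open_dataframe(paths, engine="pyarrow")

# anything else now raises immediately
_open_dataframe(paths, engine="duckdb")  # ValueError: Unknown engine: duckdb ...
```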
@@ -4,6 +4,8 @@ from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+from lamindb_setup.core.upath import get_storage_region
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
@@ -13,9 +15,35 @@ if TYPE_CHECKING:
 POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
 
 
+def _polars_storage_options(storepath: UPath) -> dict[str, str | bool]:
+    storage_options: dict[str, str | bool] = {}
+    s3fs_options = storepath.storage_options
+
+    endpoint_url = s3fs_options.get("endpoint_url", None)
+    if endpoint_url is not None:
+        storage_options["aws_virtual_hosted_style_request"] = False
+        storage_options["aws_endpoint_url"] = endpoint_url
+        if endpoint_url.startswith("http://"):
+            storage_options["aws_allow_http"] = True
+    else:
+        storage_options["aws_region"] = get_storage_region(storepath)
+
+    if s3fs_options.get("anon", False):
+        storage_options["aws_skip_signature"] = True
+    else:
+        if "key" in s3fs_options:
+            storage_options["aws_access_key_id"] = s3fs_options["key"]
+        if "secret" in s3fs_options:
+            storage_options["aws_secret_access_key"] = s3fs_options["secret"]
+        if "token" in s3fs_options:
+            storage_options["aws_session_token"] = s3fs_options["token"]
+
+    return storage_options
+
+
 @contextmanager
 def _open_polars_lazy_df(
-    paths: UPath | list[UPath], **kwargs
+    paths: UPath | list[UPath], use_fsspec: bool = False, **kwargs
 ) -> Iterator[PolarsLazyFrame]:
     try:
         import polars as pl
@@ -38,14 +66,25 @@ def _open_polars_lazy_df(
             path_list += [p for p in path.rglob("*") if p.suffix != ""]
         else:
             path_list.append(path)
+    # assume the filesystem is the same for all
+    # it is checked in _open_dataframe
+    path0 = path_list[0]
+    storage_options = None
+    if not use_fsspec:
+        storage_options = kwargs.pop("storage_options", None)
+        if path0.protocol == "s3" and storage_options is None:
+            storage_options = _polars_storage_options(path0)
 
     open_files = []
 
     try:
         for path in path_list:
-            open_files.append(path.open(mode="rb"))
+            open_files.append(path.open(mode="rb") if use_fsspec else path.as_posix())
 
-        yield scans[path_list[0].suffix](open_files, **kwargs)
+        yield scans[path_list[0].suffix](
+            open_files, storage_options=storage_options, **kwargs
+        )
     finally:
-        for open_file in open_files:
-            open_file.close()
+        if use_fsspec:
+            for open_file in open_files:
+                open_file.close()
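`_polars_storage_options` maps the s3fs options attached to a `UPath` onto the object-store keys that polars' native readers expect, so S3 scans no longer need to go through fsspec file handles by default. Roughly, for two common cases (output values illustrative):

```python
from upath import UPath

# anonymous access to a public bucket -> skip signing, pin the region
public = UPath("s3://some-public-bucket/data.parquet", anon=True)
# _polars_storage_options(public) would yield something like:
# {"aws_region": "us-west-2", "aws_skip_signature": True}

# custom endpoint (e.g. MinIO over plain http) -> path-style requests, allow http
minio = UPath(
    "s3://bucket/data.parquet",
    endpoint_url="http://localhost:9000",
    key="minioadmin",
    secret="minioadmin",
)
# _polars_storage_options(minio) would yield something like:
# {"aws_virtual_hosted_style_request": False,
#  "aws_endpoint_url": "http://localhost:9000",
#  "aws_allow_http": True,
#  "aws_access_key_id": "minioadmin",
#  "aws_secret_access_key": "minioadmin"}
```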
@@ -8,8 +8,7 @@ import pyarrow as pa
 from anndata import AnnData, read_h5ad
 from lamin_utils import logger
 from lamindb_setup import settings as setup_settings
-from lamindb_setup.core._settings_storage import get_storage_region
-from lamindb_setup.core.upath import LocalPathClasses, create_path
+from lamindb_setup.core.upath import LocalPathClasses, create_path, get_storage_region
 from packaging import version
 
 if TYPE_CHECKING:
@@ -18,10 +18,6 @@ Modules.
 
 """
 
-from ._legacy import (  # backward compat
-    CellxGeneAnnDataCatManager,
-    PertAnnDataCatManager,
-)
 from .core import (
     AnnDataCurator,
     DataFrameCurator,
@@ -31,8 +27,6 @@ from .core import (
 )
 
 __all__ = [
-    "CellxGeneAnnDataCatManager",
-    "PertAnnDataCatManager",
     "AnnDataCurator",
     "DataFrameCurator",
    "MuDataCurator",