lamindb 0.76.15__py3-none-any.whl → 0.77.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -43,7 +43,7 @@ Modules and settings.
  """

  # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
- __version__ = "0.76.15"
+ __version__ = "0.77.0"

  import os as _os

@@ -79,7 +79,7 @@ if _check_instance_setup(from_module="lnschema_core"):
  from . import core # isort: split
  from . import (
  _artifact,
- _can_validate,
+ _can_curate,
  _collection,
  _curate,
  _feature,
lamindb/_artifact.py CHANGED
@@ -111,7 +111,12 @@ def process_pathlike(
  # for the storage root: the bucket
  if not isinstance(filepath, LocalPathClasses):
  # for a cloud path, new_root is always the bucket name
- new_root = list(filepath.parents)[-1]
+ if filepath.protocol == "hf":
+ hf_path = filepath.fs.resolve_path(filepath.as_posix())
+ hf_path.path_in_repo = ""
+ new_root = "hf://" + hf_path.unresolve()
+ else:
+ new_root = list(filepath.parents)[-1]
  # do not register remote storage locations on hub if the current instance
  # is not managed on the hub
  storage_settings, _ = init_storage(
@@ -213,9 +218,9 @@ def get_stat_or_artifact(
  if stat is not None:
  # convert UPathStatResult to fsspec info dict
  stat = stat.as_info()
- if "ETag" in stat: # is file
+ if (store_type := stat["type"]) == "file":
  size, hash, hash_type = get_stat_file_cloud(stat)
- elif stat["type"] == "directory":
+ elif store_type == "directory":
  size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
  if hash is None:
  logger.warning(f"did not add hash for {path}")
@@ -240,7 +245,7 @@ def get_stat_or_artifact(
  .order_by("-created_at")
  .all()
  )
- artifact_with_same_hash_exists = len(result.filter(hash=hash).all()) > 0
+ artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
  if not artifact_with_same_hash_exists and len(result) > 0:
  logger.important(
  f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
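The new `hf://` branch in `process_pathlike` derives the storage root for a Hugging Face path by resolving it and clearing `path_in_repo`, so only the repo itself is registered as the root. A rough standalone sketch of the resulting mapping, with plain string handling standing in for the actual `resolve_path()`/`unresolve()` calls (`hf_storage_root` is a hypothetical helper, not lamindb API):

    def hf_storage_root(path: str) -> str:
        # approximate the root that the new branch computes for an hf:// path
        assert path.startswith("hf://")
        parts = path[len("hf://"):].split("/")
        # "datasets"/"spaces" prefix is optional; model repos have none
        n = 3 if parts[0] in ("datasets", "spaces") else 2
        return "hf://" + "/".join(parts[:n])

    print(hf_storage_root("hf://datasets/acme/cell-atlas/data/part-0.parquet"))
    # -> hf://datasets/acme/cell-atlas

The other change in this file, `result.filter(hash=hash).count()`, lets the database count matching rows instead of materializing the whole queryset via `.all()` just to take its length.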
lamindb/_can_validate.py → lamindb/_can_curate.py RENAMED
@@ -8,7 +8,7 @@ import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
  from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
- from lnschema_core import CanValidate, Record
+ from lnschema_core import CanCurate, Record

  from ._from_values import _has_organism_field, _print_values, get_or_create_records
  from ._record import _queryset, get_name_field
@@ -23,7 +23,7 @@ if TYPE_CHECKING:

  # from_values doesn't apply for QuerySet or Manager
  @classmethod # type:ignore
- @doc_args(CanValidate.from_values.__doc__)
+ @doc_args(CanCurate.from_values.__doc__)
  def from_values(
  cls,
  values: ListLike,
@@ -49,7 +49,7 @@ def from_values(


  @classmethod # type: ignore
- @doc_args(CanValidate.inspect.__doc__)
+ @doc_args(CanCurate.inspect.__doc__)
  def inspect(
  cls,
  values: ListLike,
@@ -71,7 +71,7 @@ def inspect(


  @classmethod # type: ignore
- @doc_args(CanValidate.validate.__doc__)
+ @doc_args(CanCurate.validate.__doc__)
  def validate(
  cls,
  values: ListLike,
@@ -108,14 +108,14 @@ def _check_organism_db(organism: Record, using_key: str | None):

  def _concat_lists(values: ListLike) -> list[str]:
  """Concatenate a list of lists of strings into a single list."""
- if len(values) > 0 and isinstance(values, (list, pd.Series)):
- try:
- if isinstance(values[0], list):
- if isinstance(values, pd.Series):
- values = values.tolist()
- values = sum([v for v in values if isinstance(v, list)], [])
- except KeyError:
- pass
+ if isinstance(values, (list, pd.Series)) and len(values) > 0:
+ first_item = values[0] if isinstance(values, list) else values.iloc[0]
+ if isinstance(first_item, list):
+ if isinstance(values, pd.Series):
+ values = values.tolist()
+ values = [
+ v for sublist in values if isinstance(sublist, list) for v in sublist
+ ]
  return values


@@ -250,7 +250,7 @@ def _validate(
  f"Your {cls.__name__} registry is empty, consider populating it first!"
  )
  if hasattr(cls, "source_id"):
- msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
+ msg += "\n → use `.import_source()` to import records from a source, e.g. a public ontology"
  logger.warning(msg)
  return np.array([False] * len(values))

@@ -268,7 +268,7 @@ def _validate(


  @classmethod # type: ignore
- @doc_args(CanValidate.standardize.__doc__)
+ @doc_args(CanCurate.standardize.__doc__)
  def standardize(
  cls,
  values: ListLike,
@@ -388,7 +388,11 @@ def _standardize(

  try:
  registry._meta.get_field(synonyms_field)
- fields = {i for i in [field, return_field, synonyms_field] if i is not None}
+ fields = {
+ field_name
+ for field_name in [field, return_field, synonyms_field]
+ if field_name is not None
+ }
  df = _filter_query_based_on_organism(
  queryset=queryset,
  field=field,
@@ -445,14 +449,19 @@ def _standardize(
  if len(std_names_bt_mapper) > 0 and not mute:
  s = "" if len(std_names_bt_mapper) == 1 else "s"
  field_print = "synonym" if field == return_field else field
- warn_msg = (
- f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
- f" {list(std_names_bt_mapper.keys())}"
+
+ reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}"
+ truncated_note = (
+ " (output truncated)" if len(std_names_bt_mapper) > 10 else ""
  )
- warn_msg += (
- f"\n please add corresponding {registry._meta.model.__name__} records via"
- f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
+
+ warn_msg = (
+ f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty{truncated_note}:"
+ f" {reduced_mapped_keys_str}\n"
+ f" please add corresponding {registry._meta.model.__name__} records via{truncated_note}:"
+ f" `.from_values({reduced_mapped_keys_str})`"
  )
+
  logger.warning(warn_msg)

  mapper.update(std_names_bt_mapper)
@@ -612,10 +621,10 @@ if ln_setup._TESTING: # type: ignore
  from inspect import signature

  SIGS = {
- name: signature(getattr(CanValidate, name))
+ name: signature(getattr(CanCurate, name))
  for name in METHOD_NAMES
  if not name.startswith("__")
  }

  for name in METHOD_NAMES:
- attach_func_to_class_method(name, CanValidate, globals())
+ attach_func_to_class_method(name, CanCurate, globals())
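The rewritten `_concat_lists` drops the broad `try/except KeyError` and the repeated-concatenation `sum(..., [])` in favor of an explicit check of the first element, so a `pd.Series` of lists is flattened the same way as a plain list of lists. A small standalone check of that logic (same body as the new version, minus the lamindb `ListLike` alias):

    import pandas as pd

    def concat_lists(values):
        # flatten one level only when the first element is itself a list
        if isinstance(values, (list, pd.Series)) and len(values) > 0:
            first_item = values[0] if isinstance(values, list) else values.iloc[0]
            if isinstance(first_item, list):
                if isinstance(values, pd.Series):
                    values = values.tolist()
                values = [
                    v for sublist in values if isinstance(sublist, list) for v in sublist
                ]
        return values

    print(concat_lists([["CD4", "CD8A"], ["FOXP3"]]))    # ['CD4', 'CD8A', 'FOXP3']
    print(concat_lists(pd.Series([["a"], ["b", "c"]])))  # ['a', 'b', 'c']
    print(concat_lists(["plain", "strings"]))            # unchanged: ['plain', 'strings']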
lamindb/_curate.py CHANGED
@@ -20,6 +20,7 @@ from .core.exceptions import ValidationError

  if TYPE_CHECKING:
  from collections.abc import Iterable
+ from typing import Any

  from lamindb_setup.core.types import UPathStr
  from lnschema_core.types import FieldAttr
@@ -226,7 +227,7 @@ class DataFrameCurator(BaseCurator):
  f"the following keys passed to {name} are not allowed: {nonval_keys}"
  )

- def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
+ def _save_columns(self, validated_only: bool = True) -> None:
  """Save column name records."""
  # Always save features specified as the fields keys
  update_registry(
@@ -238,7 +239,7 @@
  validated_only=False,
  source=self._sources.get("columns"),
  exclude=self._exclude.get("columns"),
- **kwargs,
+ **self._kwargs, # type: ignore
  )

  # Save the rest of the columns based on validated_only
@@ -255,7 +256,7 @@
  source=self._sources.get("columns"),
  exclude=self._exclude.get("columns"),
  warning=False, # Do not warn about missing columns, just an info message
- **kwargs,
+ **self._kwargs, # type: ignore
  )

  def add_new_from(self, key: str, organism: str | None = None, **kwargs):
@@ -292,7 +293,7 @@
  f"Feature {categorical} is not part of the fields!"
  )
  update_registry(
- values=flatten_unique(self._df[categorical]),
+ values=_flatten_unique(self._df[categorical]),
  field=self.fields[categorical],
  key=categorical,
  using_key=self._using_key,
@@ -305,7 +306,6 @@
  def _update_registry_all(self, validated_only: bool = True, **kwargs):
  """Save labels for all features."""
  for name in self.fields.keys():
- logger.info(f"saving validated records of '{name}'")
  self._update_registry(name, validated_only=validated_only, **kwargs)

  def validate(self, organism: str | None = None) -> bool:
@@ -436,12 +436,15 @@ class AnnDataCurator(DataFrameCurator):
  ) -> None:
  from lamindb_setup.core import upath

+ if isinstance(var_index, str):
+ raise TypeError("var_index parameter has to be a bionty field")
+
  from ._artifact import data_is_anndata

  if sources is None:
  sources = {}
  if not data_is_anndata(data):
- raise ValueError(
+ raise TypeError(
  "data has to be an AnnData object or a path to AnnData-like"
  )
  if isinstance(data, ad.AnnData):
@@ -451,6 +454,11 @@

  self._adata = backed_access(upath.create_path(data))

+ if "symbol" in str(var_index):
+ logger.warning(
+ "Curating gene symbols is discouraged. See FAQ for more details."
+ )
+
  self._data = data
  self._var_field = var_index
  super().__init__(
@@ -512,10 +520,8 @@

  def _update_registry_all(self, validated_only: bool = True, **kwargs):
  """Save labels for all features."""
- logger.info("saving validated records of 'var_index'")
  self._save_from_var_index(validated_only=validated_only, **self._kwargs)
  for name in self._obs_fields.keys():
- logger.info(f"saving validated terms of '{name}'")
  self._update_registry(name, validated_only=validated_only, **self._kwargs)

  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
@@ -1229,7 +1235,7 @@ def validate_categories(
  if n_non_validated == 0:
  if n_validated == 0:
  logger.indent = ""
- logger.success(f"{key} is validated against {colors.italic(model_field)}")
+ logger.success(f"'{key}' is validated against {colors.italic(model_field)}")
  return True, []
  else:
  # validated values still need to be saved to the current instance
@@ -1434,8 +1440,8 @@ def save_artifact(
  return artifact


- def flatten_unique(series):
- """Flatten a pandas series if it contains lists."""
+ def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
+ """Flatten a Pandas series containing lists or single items into a unique list of elements."""
  result = set()

  for item in series:
@@ -1505,9 +1511,14 @@ def update_registry(

  public_records = [r for r in existing_and_public_records if r._state.adding]
  # here we check to only save the public records if they are from the specified source
- # we check the uid because r.source and soruce can be from different instances
+ # we check the uid because r.source and source can be from different instances
  if source:
  public_records = [r for r in public_records if r.source.uid == source.uid]
+
+ if public_records:
+ settings.verbosity = "info"
+ logger.info(f"saving validated records of '{key}'")
+ settings.verbosity = "error"
  ln_save(public_records)
  labels_saved["from public"] = [
  getattr(r, field.field.name) for r in public_records
@@ -1720,7 +1731,7 @@ def _save_organism(name: str): # pragma: no cover

  def _ref_is_name(field: FieldAttr) -> bool | None:
  """Check if the reference field is a name field."""
- from ._can_validate import get_name_field
+ from ._can_curate import get_name_field

  name_field = get_name_field(field.field.model)
  return field.field.name == name_field
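For `AnnDataCurator`, `var_index` must now be a registry field rather than a string, non-AnnData inputs raise `TypeError` instead of `ValueError`, and curating gene symbols triggers a warning. A hedged usage sketch: it assumes the public entry point `ln.Curator.from_anndata` and the `bionty` field accessors shown, neither of which appears in this diff, and it requires a connected lamindb instance:

    import anndata as ad
    import bionty as bt
    import lamindb as ln
    import numpy as np
    import pandas as pd

    adata = ad.AnnData(
        X=np.zeros((2, 2)),
        var=pd.DataFrame(index=["ENSG00000139618", "ENSG00000141510"]),
        obs=pd.DataFrame({"cell_type": ["B cell", "T cell"]}, index=["c1", "c2"]),
    )

    # assumed usage: var_index takes a registry field such as bt.Gene.ensembl_gene_id;
    # passing the string "ensembl_gene_id" now raises TypeError, and pointing it at
    # bt.Gene.symbol would log the new gene-symbol warning
    curator = ln.Curator.from_anndata(
        adata,
        var_index=bt.Gene.ensembl_gene_id,
        categoricals={"cell_type": bt.CellType.name},
        organism="human",
    )
    curator.validate()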
lamindb/_finish.py CHANGED
@@ -103,10 +103,10 @@ def save_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == "notebook"
+ is_ipynb = filepath.suffix == ".ipynb"
  source_code_path = filepath
  # for notebooks, we need more work
- if is_notebook:
+ if is_ipynb:
  try:
  import jupytext
  from nbproject.dev import (
@@ -198,7 +198,7 @@ def save_context_core(
  run.finished_at = datetime.now(timezone.utc)

  # track report and set is_consecutive
- if not is_notebook:
+ if not is_ipynb:
  run.is_consecutive = True
  run.save()
  else:
@@ -234,8 +234,15 @@
  # finalize
  if not from_cli:
  run_time = run.finished_at - run.started_at
+ days = run_time.days
+ seconds = run_time.seconds
+ hours = seconds // 3600
+ minutes = (seconds % 3600) // 60
+ secs = seconds % 60
+ formatted_run_time = f"{days}d {hours}h {minutes}m {secs}s"
+
  logger.important(
- f"finished Run('{run.uid[:8]}') after {run_time} at {format_field_value(run.finished_at)}"
+ f"finished Run('{run.uid[:8]}') after {formatted_run_time} at {format_field_value(run.finished_at)}"
  )
  if ln_setup.settings.instance.is_on_hub:
  identifier = ln_setup.settings.instance.slug
@@ -244,9 +251,7 @@
  )
  if not from_cli:
  thing, name = (
- ("notebook", "notebook.ipynb")
- if is_notebook
- else ("script", "script.py")
+ ("notebook", "notebook.ipynb") if is_ipynb else ("script", "script.py")
  )
  logger.important(
  f"if you want to update your {thing} without re-running it, use `lamin save {name}`"
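`save_context_core` now reports the run duration as days, hours, minutes, and seconds rather than printing the raw `timedelta`. The same arithmetic in isolation (variable names mirror the diff):

    from datetime import timedelta

    run_time = timedelta(days=1, hours=2, minutes=3, seconds=4)
    days = run_time.days
    seconds = run_time.seconds          # seconds within the last partial day
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    secs = seconds % 60
    print(f"{days}d {hours}h {minutes}m {secs}s")  # -> 1d 2h 3m 4s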
lamindb/_parents.py CHANGED
@@ -19,7 +19,14 @@ if TYPE_CHECKING:
  LAMIN_GREEN_LIGHTER = "#10b981"
  LAMIN_GREEN_DARKER = "#065f46"
  GREEN_FILL = "honeydew"
- TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}
+ TRANSFORM_EMOJIS = {
+ "notebook": "📔",
+ "upload": "🖥️",
+ "pipeline": "🧩",
+ "script": "📝",
+ "function": "🔧",
+ "glue": "🧲",
+ }
  is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)

lamindb/_query_set.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

  from collections import UserList
- from typing import TYPE_CHECKING, NamedTuple
+ from collections.abc import Iterable
+ from collections.abc import Iterable as IterableType
+ from typing import TYPE_CHECKING, Any, NamedTuple

  import pandas as pd
  from django.db import models
@@ -10,7 +12,7 @@ from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
  from lnschema_core.models import (
  Artifact,
- CanValidate,
+ CanCurate,
  Collection,
  IsVersioned,
  Record,
@@ -69,8 +71,33 @@ def one_helper(self):
  return self[0]


- def process_expressions(registry: Registry, expressions: dict) -> dict:
- if registry in {Artifact, Collection}:
+ def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
+ def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]:
+ if isinstance(value, Record):
+ if value._state.db != target_db:
+ logger.warning(
+ f"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'"
+ )
+ return f"{key}__uid", value.uid
+ return key, value
+
+ if (
+ key.endswith("__in")
+ and isinstance(value, IterableType)
+ and not isinstance(value, str)
+ ):
+ if any(isinstance(v, Record) and v._state.db != target_db for v in value):
+ logger.warning(
+ f"passing records from another database to query {target_db}, matching on uids"
+ )
+ return key.replace("__in", "__uid__in"), [
+ v.uid if isinstance(v, Record) else v for v in value
+ ]
+ return key, value
+
+ return key, value
+
+ if queryset.model in {Artifact, Collection}:
  # visibility is set to 0 unless expressions contains id or uid equality
  if not (
  "id" in expressions
@@ -87,7 +114,17 @@ def process_expressions(registry: Registry, expressions: dict) -> dict:
  # sense for a non-NULLABLE column
  elif visibility in expressions and expressions[visibility] is None:
  expressions.pop(visibility)
- return expressions
+ if queryset._db is not None:
+ # only check for database mismatch if there is a defined database on the
+ # queryset
+ return dict(
+ (
+ _map_databases(value, key, queryset._db)
+ for key, value in expressions.items()
+ )
+ )
+ else:
+ return expressions


  def get(
@@ -114,7 +151,7 @@ def get(
  return qs.one()
  else:
  assert idlike is None # noqa: S101
- expressions = process_expressions(registry, expressions)
+ expressions = process_expressions(qs, expressions)
  return registry.objects.using(qs.db).get(**expressions)


@@ -282,6 +319,14 @@ class QuerySet(models.QuerySet):
  """Query a single record. Raises error if there are more or none."""
  return get(self, idlike, **expressions)

+ def filter(self, *queries, **expressions) -> QuerySet:
+ """Query a set of records."""
+ expressions = process_expressions(self, expressions)
+ if len(expressions) > 0:
+ return super().filter(*queries, **expressions)
+ else:
+ return self
+
  def one(self) -> Record:
  """Exactly one result. Raises error if there are more or none."""
  return one_helper(self)
@@ -309,7 +354,7 @@ class QuerySet(models.QuerySet):


  # -------------------------------------------------------------------------------------
- # CanValidate
+ # CanCurate
  # -------------------------------------------------------------------------------------


@@ -329,26 +374,26 @@ def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple:
  return _lookup(cls=self, field=field, **kwargs)


- @doc_args(CanValidate.validate.__doc__)
+ @doc_args(CanCurate.validate.__doc__)
  def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _validate
+ from ._can_curate import _validate

  return _validate(cls=self, values=values, field=field, **kwargs)


- @doc_args(CanValidate.inspect.__doc__)
+ @doc_args(CanCurate.inspect.__doc__)
  def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _inspect
+ from ._can_curate import _inspect

  return _inspect(cls=self, values=values, field=field, **kwargs)


- @doc_args(CanValidate.standardize.__doc__)
+ @doc_args(CanCurate.standardize.__doc__)
  def standardize(self, values: Iterable, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _standardize
+ from ._can_curate import _standardize

  return _standardize(cls=self, values=values, field=field, **kwargs)
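The new `QuerySet.filter` override routes keyword expressions through `process_expressions`, whose `_map_databases` helper rewrites filters that reference a `Record` stored in a different database so they match on `uid` instead. A minimal standalone sketch of that rewriting rule for the simple (non-`__in`) case; `FakeRecord` and `map_expression` are stand-ins for illustration, not lamindb names:

    from typing import Any

    class FakeRecord:
        """Stand-in for lnschema_core.Record, just enough for the sketch."""
        def __init__(self, uid: str, db: str):
            self.uid = uid
            self.db = db  # lamindb reads this from record._state.db

    def map_expression(key: str, value: Any, target_db: str) -> tuple[str, Any]:
        # mirrors the new _map_databases logic for a single record value
        if isinstance(value, FakeRecord) and value.db != target_db:
            return f"{key}__uid", value.uid
        return key, value

    cell_type = FakeRecord(uid="3JkPyVYn", db="laminlabs/cellxgene")
    print(map_expression("cell_types", cell_type, target_db="default"))
    # -> ('cell_types__uid', '3JkPyVYn')

When no keyword expressions remain after processing, the override returns the queryset itself instead of delegating to Django's `filter`.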