PyPI - lamindb - Versions diffs - 0.76.14__py3-none-any.whl → 0.76.16__py3-none-any.whl - Mend

lamindb 0.76.14__py3-none-any.whl → 0.76.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

lamindb/__init__.py +1 -1
lamindb/_artifact.py +44 -35
lamindb/_can_validate.py +31 -22
lamindb/_collection.py +6 -5
lamindb/_curate.py +80 -48
lamindb/_feature.py +2 -3
lamindb/_feature_set.py +1 -2
lamindb/_finish.py +12 -7
lamindb/_is_versioned.py +1 -2
lamindb/_parents.py +28 -5
lamindb/_query_manager.py +1 -2
lamindb/_query_set.py +51 -6
lamindb/_record.py +125 -62
lamindb/_save.py +2 -2
lamindb/_transform.py +1 -2
lamindb/_ulabel.py +1 -1
lamindb/core/_context.py +48 -26
lamindb/core/_label_manager.py +1 -1
lamindb/core/_mapped_collection.py +1 -1
lamindb/core/storage/_anndata_accessor.py +7 -4
lamindb/core/storage/_backed_access.py +16 -8
lamindb/core/storage/_pyarrow_dataset.py +31 -0
{lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/METADATA +20 -9
{lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/RECORD +26 -26
lamindb/_filter.py +0 -21
{lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/LICENSE +0 -0
{lamindb-0.76.14.dist-info → lamindb-0.76.16.dist-info}/WHEEL +0 -0

lamindb/_finish.py CHANGED Viewed

@@ -103,10 +103,10 @@ def save_context_core(
     # for scripts, things are easy
     is_consecutive = True
-    is_notebook = transform.type == "notebook"
+    is_ipynb = filepath.suffix == ".ipynb"
     source_code_path = filepath
     # for notebooks, we need more work
-    if is_notebook:
+    if is_ipynb:
         try:
             import jupytext
             from nbproject.dev import (
@@ -198,7 +198,7 @@ def save_context_core(
         run.finished_at = datetime.now(timezone.utc)
     # track report and set is_consecutive
-    if not is_notebook:
+    if not is_ipynb:
         run.is_consecutive = True
         run.save()
     else:
@@ -234,8 +234,15 @@ def save_context_core(
     # finalize
     if not from_cli:
         run_time = run.finished_at - run.started_at
+        days = run_time.days
+        seconds = run_time.seconds
+        hours = seconds // 3600
+        minutes = (seconds % 3600) // 60
+        secs = seconds % 60
+        formatted_run_time = f"{days}d {hours}h {minutes}m {secs}s"
         logger.important(
-            f"finished Run('{run.uid[:8]}') after {run_time} at {format_field_value(run.finished_at)}"
+            f"finished Run('{run.uid[:8]}') after {formatted_run_time} at {format_field_value(run.finished_at)}"
         )
     if ln_setup.settings.instance.is_on_hub:
         identifier = ln_setup.settings.instance.slug
@@ -244,9 +251,7 @@ def save_context_core(
         )
         if not from_cli:
             thing, name = (
-                ("notebook", "notebook.ipynb")
-                if is_notebook
-                else ("script", "script.py")
+                ("notebook", "notebook.ipynb") if is_ipynb else ("script", "script.py")
             )
             logger.important(
                 f"if you want to update your {thing} without re-running it, use `lamin save {name}`"

lamindb/_is_versioned.py CHANGED Viewed

@@ -5,8 +5,7 @@ from lamin_utils import logger
 from lamindb_setup.core.upath import UPath
 from lnschema_core.models import IsVersioned
-from lamindb._utils import attach_func_to_class_method
+from ._utils import attach_func_to_class_method
 from .core.versioning import create_uid, get_new_path_from_uid

lamindb/_parents.py CHANGED Viewed

@@ -8,13 +8,14 @@ from lamin_utils import logger
 from lnschema_core import Artifact, Collection, Record, Run, Transform
 from lnschema_core.models import HasParents, format_field_value
-from lamindb._utils import attach_func_to_class_method
 from ._record import get_name_field
+from ._utils import attach_func_to_class_method
 if TYPE_CHECKING:
     from lnschema_core.types import StrField
+    from lamindb.core import QuerySet
 LAMIN_GREEN_LIGHTER = "#10b981"
 LAMIN_GREEN_DARKER = "#065f46"
 GREEN_FILL = "honeydew"
@@ -22,6 +23,30 @@ TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
+# this is optimized to have fewer recursive calls
+# also len of QuerySet can be costly at times
+def _query_relatives(
+    records: QuerySet | list[Record],
+    kind: Literal["parents", "children"],
+    cls: type[HasParents],
+) -> QuerySet:
+    relatives = cls.objects.none()
+    if len(records) == 0:
+        return relatives
+    for record in records:
+        relatives = relatives.union(getattr(record, kind).all())
+    relatives = relatives.union(_query_relatives(relatives, kind, cls))
+    return relatives
+def query_parents(self) -> QuerySet:
+    return _query_relatives([self], "parents", self.__class__)
+def query_children(self) -> QuerySet:
+    return _query_relatives([self], "children", self.__class__)
 def _transform_emoji(transform: Transform):
     if transform is not None:
         return TRANSFORM_EMOJIS.get(transform.type, "💫")
@@ -474,9 +499,7 @@ def _df_edges_from_runs(df_values: list):
     return df
-METHOD_NAMES = [
-    "view_parents",
-]
+METHOD_NAMES = ["view_parents", "query_parents", "query_children"]
 if ln_setup._TESTING:  # type: ignore
     from inspect import signature

lamindb/_query_manager.py CHANGED Viewed

@@ -7,9 +7,8 @@ from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lnschema_core.models import Record
-from lamindb.core._settings import settings
 from .core._feature_manager import get_feature_set_by_slot_
+from .core._settings import settings
 if TYPE_CHECKING:
     from lnschema_core.types import StrField

lamindb/_query_set.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from __future__ import annotations
 from collections import UserList
-from typing import TYPE_CHECKING, NamedTuple
+from collections.abc import Iterable
+from collections.abc import Iterable as IterableType
+from typing import TYPE_CHECKING, Any, NamedTuple
 import pandas as pd
 from django.db import models
@@ -20,7 +22,7 @@ from lnschema_core.models import (
     VisibilityChoice,
 )
-from lamindb.core.exceptions import DoesNotExist
+from .core.exceptions import DoesNotExist
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -69,8 +71,33 @@ def one_helper(self):
         return self[0]
-def process_expressions(registry: Registry, expressions: dict) -> dict:
-    if registry in {Artifact, Collection}:
+def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
+    def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]:
+        if isinstance(value, Record):
+            if value._state.db != target_db:
+                logger.warning(
+                    f"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'"
+                )
+                return f"{key}__uid", value.uid
+            return key, value
+        if (
+            key.endswith("__in")
+            and isinstance(value, IterableType)
+            and not isinstance(value, str)
+        ):
+            if any(isinstance(v, Record) and v._state.db != target_db for v in value):
+                logger.warning(
+                    f"passing records from another database to query {target_db}, matching on uids"
+                )
+                return key.replace("__in", "__uid__in"), [
+                    v.uid if isinstance(v, Record) else v for v in value
+                ]
+            return key, value
+        return key, value
+    if queryset.model in {Artifact, Collection}:
         # visibility is set to 0 unless expressions contains id or uid equality
         if not (
             "id" in expressions
@@ -87,7 +114,17 @@ def process_expressions(registry: Registry, expressions: dict) -> dict:
             # sense for a non-NULLABLE column
             elif visibility in expressions and expressions[visibility] is None:
                 expressions.pop(visibility)
-    return expressions
+    if queryset._db is not None:
+        # only check for database mismatch if there is a defined database on the
+        # queryset
+        return dict(
+            (
+                _map_databases(value, key, queryset._db)
+                for key, value in expressions.items()
+            )
+        )
+    else:
+        return expressions
 def get(
@@ -114,7 +151,7 @@ def get(
             return qs.one()
     else:
         assert idlike is None  # noqa: S101
-        expressions = process_expressions(registry, expressions)
+        expressions = process_expressions(qs, expressions)
         return registry.objects.using(qs.db).get(**expressions)
@@ -282,6 +319,14 @@ class QuerySet(models.QuerySet):
         """Query a single record. Raises error if there are more or none."""
         return get(self, idlike, **expressions)
+    def filter(self, *queries, **expressions) -> QuerySet:
+        """Query a set of records."""
+        expressions = process_expressions(self, expressions)
+        if len(expressions) > 0:
+            return super().filter(*queries, **expressions)
+        else:
+            return self
     def one(self) -> Record:
         """Exactly one result. Raises error if there are more or none."""
         return one_helper(self)

lamindb/_record.py CHANGED Viewed

@@ -1,12 +1,16 @@
 from __future__ import annotations
 import builtins
+from functools import reduce
 from typing import TYPE_CHECKING, NamedTuple
 import dj_database_url
 import lamindb_setup as ln_setup
+from django.core.exceptions import FieldDoesNotExist
 from django.db import connections, transaction
-from django.db.models import IntegerField, Manager, Q, QuerySet, Value
+from django.db.models import F, IntegerField, Manager, Q, QuerySet, TextField, Value
+from django.db.models.functions import Cast, Coalesce
+from django.db.models.lookups import Contains, Exact, IContains, IExact, IRegex, Regex
 from lamin_utils import colors, logger
 from lamin_utils._lookup import Lookup
 from lamindb_setup._connect_instance import (
@@ -17,11 +21,22 @@ from lamindb_setup._connect_instance import (
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core._hub_core import connect_instance_hub
 from lamindb_setup.core._settings_store import instance_settings_file
-from lnschema_core.models import Artifact, Feature, IsVersioned, Record, Run, Transform
+from lnschema_core.models import (
+    Artifact,
+    Collection,
+    Feature,
+    FeatureSet,
+    IsVersioned,
+    Param,
+    Record,
+    Run,
+    Transform,
+    ULabel,
+)
-from lamindb._utils import attach_func_to_class_method
-from lamindb.core._settings import settings
-from lamindb.core.exceptions import RecordNameChangeIntegrityError
+from ._utils import attach_func_to_class_method
+from .core._settings import settings
+from .core.exceptions import RecordNameChangeIntegrityError, ValidationError
 if TYPE_CHECKING:
     import pandas as pd
@@ -48,6 +63,7 @@ def update_attributes(record: Record, attributes: dict[str, str]):
 def validate_required_fields(record: Record, kwargs):
+    # a "required field" is a Django field that has `null=True, default=None`
     required_fields = {
         k.name for k in record._meta.fields if not k.null and k.default is None
     }
@@ -58,25 +74,47 @@ def validate_required_fields(record: Record, kwargs):
     ]
     if missing_fields:
         raise TypeError(f"{missing_fields} are required.")
+    # ensure the exact length of the internal uid for core entities
+    if "uid" in kwargs and record.__class__ in {
+        Artifact,
+        Collection,
+        Transform,
+        Run,
+        ULabel,
+        Feature,
+        FeatureSet,
+        Param,
+    }:
+        uid_max_length = record.__class__._meta.get_field(
+            "uid"
+        ).max_length  # triggers FieldDoesNotExist
+        if len(kwargs["uid"]) != uid_max_length:  # triggers KeyError
+            raise ValidationError(
+                f'`uid` must be exactly {uid_max_length} characters long, got {len(kwargs["uid"])}.'
+            )
-def suggest_records_with_similar_names(record: Record, kwargs) -> bool:
+def suggest_records_with_similar_names(record: Record, name_field: str, kwargs) -> bool:
     """Returns True if found exact match, otherwise False.
     Logs similar matches if found.
     """
-    if kwargs.get("name") is None:
+    if kwargs.get(name_field) is None or not isinstance(kwargs.get(name_field), str):
         return False
     queryset = _search(
-        record.__class__, kwargs["name"], field="name", truncate_words=True, limit=3
+        record.__class__,
+        kwargs[name_field],
+        field=name_field,
+        truncate_string=True,
+        limit=3,
     )
     if not queryset.exists():  # empty queryset
         return False
     for alternative_record in queryset:
-        if alternative_record.name == kwargs["name"]:
+        if getattr(alternative_record, name_field) == kwargs[name_field]:
             return True
     s, it, nots = ("", "it", "s") if len(queryset) == 1 else ("s", "one of them", "")
-    msg = f"record{s} with similar name{s} exist{nots}! did you mean to load {it}?"
+    msg = f"record{s} with similar {name_field}{s} exist{nots}! did you mean to load {it}?"
     if IPYTHON:
         from IPython.display import display
@@ -98,13 +136,19 @@ def __init__(record: Record, *args, **kwargs):
         if "_has_consciously_provided_uid" in kwargs:
             has_consciously_provided_uid = kwargs.pop("_has_consciously_provided_uid")
         if settings.creation.search_names and not has_consciously_provided_uid:
-            match = suggest_records_with_similar_names(record, kwargs)
+            name_field = (
+                "name" if not hasattr(record, "_name_field") else record._name_field
+            )
+            match = suggest_records_with_similar_names(record, name_field, kwargs)
             if match:
                 if "version" in kwargs:
                     if kwargs["version"] is not None:
                         version_comment = " and version"
                         existing_record = record.__class__.filter(
-                            name=kwargs["name"], version=kwargs["version"]
+                            **{
+                                name_field: kwargs[name_field],
+                                "version": kwargs["version"],
+                            }
                         ).one_or_none()
                     else:
                         # for a versioned record, an exact name match is not a
@@ -115,12 +159,12 @@ def __init__(record: Record, *args, **kwargs):
                 else:
                     version_comment = ""
                     existing_record = record.__class__.filter(
-                        name=kwargs["name"]
+                        **{name_field: kwargs[name_field]}
                     ).one_or_none()
                 if existing_record is not None:
                     logger.important(
                         f"returning existing {record.__class__.__name__} record with same"
-                        f" name{version_comment}: '{kwargs['name']}'"
+                        f" {name_field}{version_comment}: '{kwargs[name_field]}'"
                     )
                     init_self_from_db(record, existing_record)
                     return None
@@ -137,9 +181,13 @@ def __init__(record: Record, *args, **kwargs):
 @doc_args(Record.filter.__doc__)
 def filter(cls, *queries, **expressions) -> QuerySet:
     """{}"""  # noqa: D415
-    from lamindb._filter import filter
+    from lamindb._query_set import QuerySet
-    return filter(cls, *queries, **expressions)
+    _using_key = None
+    if "_using_key" in expressions:
+        _using_key = expressions.pop("_using_key")
+    return QuerySet(model=cls, using=_using_key).filter(*queries, **expressions)
 @classmethod  # type:ignore
@@ -150,8 +198,6 @@ def get(
     **expressions,
 ) -> Record:
     """{}"""  # noqa: D415
-    # this is the only place in which we need the lamindb queryset
-    # in this file; everywhere else it should be Django's
     from lamindb._query_set import QuerySet
     return QuerySet(model=cls).get(idlike, **expressions)
@@ -166,9 +212,7 @@ def df(
     limit: int = 100,
 ) -> pd.DataFrame:
     """{}"""  # noqa: D415
-    from lamindb._filter import filter
-    query_set = filter(cls)
+    query_set = cls.filter()
     if hasattr(cls, "updated_at"):
         query_set = query_set.order_by("-updated_at")
     return query_set[:limit].df(include=include, join=join)
@@ -182,7 +226,7 @@ def _search(
     limit: int | None = 20,
     case_sensitive: bool = False,
     using_key: str | None = None,
-    truncate_words: bool = False,
+    truncate_string: bool = False,
 ) -> QuerySet:
     input_queryset = _queryset(cls, using_key=using_key)
     registry = input_queryset.model
@@ -209,48 +253,67 @@ def _search(
             else:
                 fields.append(field)
-    # decompose search string
-    def truncate_word(word) -> str:
-        if len(word) > 5:
-            n_80_pct = int(len(word) * 0.8)
-            return word[:n_80_pct]
-        elif len(word) > 3:
-            return word[:3]
-        else:
-            return word
-    decomposed_string = str(string).split()
-    # add the entire string back
-    decomposed_string += [string]
-    for word in decomposed_string:
-        # will not search against words with 3 or fewer characters
-        if len(word) <= 3:
-            decomposed_string.remove(word)
-    if truncate_words:
-        decomposed_string = [truncate_word(word) for word in decomposed_string]
-    # construct the query
-    expression = Q()
-    case_sensitive_i = "" if case_sensitive else "i"
-    for field in fields:
-        for word in decomposed_string:
-            query = {f"{field}__{case_sensitive_i}contains": word}
-            expression |= Q(**query)
-    output_queryset = input_queryset.filter(expression)
-    # ensure exact matches are at the top
-    narrow_expression = Q()
+    if truncate_string:
+        if (len_string := len(string)) > 5:
+            n_80_pct = int(len_string * 0.8)
+            string = string[:n_80_pct]
+    string = string.strip()
+    exact_lookup = Exact if case_sensitive else IExact
+    regex_lookup = Regex if case_sensitive else IRegex
+    contains_lookup = Contains if case_sensitive else IContains
+    ranks = []
+    contains_filters = []
     for field in fields:
-        query = {f"{field}__{case_sensitive_i}contains": string}
-        narrow_expression |= Q(**query)
-    refined_output_queryset = output_queryset.filter(narrow_expression).annotate(
-        ordering=Value(1, output_field=IntegerField())
-    )
-    remaining_output_queryset = output_queryset.exclude(narrow_expression).annotate(
-        ordering=Value(2, output_field=IntegerField())
+        field_expr = Coalesce(
+            Cast(field, output_field=TextField()),
+            Value(""),
+            output_field=TextField(),
+        )
+        # exact rank
+        exact_expr = exact_lookup(field_expr, string)
+        exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200
+        ranks.append(exact_rank)
+        # exact synonym
+        synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string}(?:\|.*|$)")
+        synonym_rank = Cast(synonym_expr, output_field=IntegerField()) * 200
+        ranks.append(synonym_rank)
+        # match as sub-phrase
+        sub_expr = regex_lookup(
+            field_expr, rf"(?:^|.*[ \|\.,;:]){string}(?:[ \|\.,;:].*|$)"
+        )
+        sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10
+        ranks.append(sub_rank)
+        # startswith and avoid matching string with " " on the right
+        # mostly for truncated
+        startswith_expr = regex_lookup(field_expr, rf"(?:^|\|){string}[^ ]*(\||$)")
+        startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8
+        ranks.append(startswith_rank)
+        # match as sub-phrase from the left, mostly for truncated
+        right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string}.*")
+        right_rank = Cast(right_expr, output_field=IntegerField()) * 2
+        ranks.append(right_rank)
+        # match as sub-phrase from the right
+        left_expr = regex_lookup(field_expr, rf".*{string}(?:$|[ \|\.,;:].*)")
+        left_rank = Cast(left_expr, output_field=IntegerField()) * 2
+        ranks.append(left_rank)
+        # simple contains filter
+        contains_expr = contains_lookup(field_expr, string)
+        contains_filter = Q(contains_expr)
+        contains_filters.append(contains_filter)
+        # also rank by contains
+        contains_rank = Cast(contains_expr, output_field=IntegerField())
+        ranks.append(contains_rank)
+    ranked_queryset = (
+        input_queryset.filter(reduce(lambda a, b: a | b, contains_filters))
+        .alias(rank=sum(ranks))
+        .order_by("-rank")
     )
-    combined_queryset = refined_output_queryset.union(
-        remaining_output_queryset
-    ).order_by("ordering")[:limit]
-    return combined_queryset
+    return ranked_queryset[:limit]
 @classmethod  # type: ignore

lamindb/_save.py CHANGED Viewed

@@ -15,8 +15,8 @@ from lamin_utils import logger
 from lamindb_setup.core.upath import LocalPathClasses
 from lnschema_core.models import Artifact, Record
-from lamindb.core._settings import settings
-from lamindb.core.storage.paths import (
+from .core._settings import settings
+from .core.storage.paths import (
     _cache_key_from_artifact_storage,
     attempt_accessing_path,
     auto_storage_key_from_artifact,

lamindb/_transform.py CHANGED Viewed

@@ -6,10 +6,9 @@ from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lnschema_core.models import Run, Transform
-from lamindb.core.exceptions import InconsistentKey
 from ._parents import _view_parents
 from ._run import delete_run_artifacts
+from .core.exceptions import InconsistentKey
 from .core.versioning import message_update_key_in_version_family, process_revises
 if TYPE_CHECKING:

lamindb/_ulabel.py CHANGED Viewed

@@ -6,7 +6,7 @@ import lamindb_setup as ln_setup
 from lamindb_setup.core._docs import doc_args
 from lnschema_core import ULabel
-from lamindb._utils import attach_func_to_class_method
+from ._utils import attach_func_to_class_method
 if TYPE_CHECKING:
     from lnschema_core.types import ListLike

lamindb 0.76.14__py3-none-any.whl → 0.76.16__py3-none-any.whl

lamindb 0.76.14__py3-none-any.whl → 0.76.16__py3-none-any.whl