lamindb 0.45.0__py3-none-any.whl → 0.46a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +30 -9
- lamindb/_context.py +11 -12
- lamindb/_dataset.py +142 -0
- lamindb/_delete.py +6 -6
- lamindb/_feature_set.py +138 -0
- lamindb/_file.py +322 -81
- lamindb/_from_values.py +57 -160
- lamindb/_orm.py +398 -0
- lamindb/_save.py +26 -10
- lamindb/_select.py +3 -3
- lamindb/_view.py +2 -2
- lamindb/dev/__init__.py +2 -2
- lamindb/dev/_settings.py +2 -1
- lamindb/dev/datasets/__init__.py +6 -0
- lamindb/dev/datasets/_core.py +30 -0
- lamindb/dev/hashing.py +4 -0
- lamindb/dev/storage/__init__.py +4 -3
- lamindb/dev/storage/_backed_access.py +3 -3
- lamindb/dev/storage/{_file.py → file.py} +48 -3
- lamindb/dev/storage/{_object.py → object.py} +1 -0
- lamindb/dev/utils.py +9 -0
- lamindb/types.py +9 -1
- {lamindb-0.45.0.dist-info → lamindb-0.46a1.dist-info}/METADATA +20 -17
- lamindb-0.46a1.dist-info/RECORD +36 -0
- lamindb/_baseorm_methods.py +0 -535
- lamindb/_featureset_methods.py +0 -73
- lamindb/_file_access.py +0 -48
- lamindb/_file_methods.py +0 -319
- lamindb-0.45.0.dist-info/RECORD +0 -36
- /lamindb/{_transform_methods.py → _transform.py} +0 -0
- {lamindb-0.45.0.dist-info → lamindb-0.46a1.dist-info}/LICENSE +0 -0
- {lamindb-0.45.0.dist-info → lamindb-0.46a1.dist-info}/WHEEL +0 -0
- {lamindb-0.45.0.dist-info → lamindb-0.46a1.dist-info}/entry_points.txt +0 -0
lamindb/_orm.py
ADDED
@@ -0,0 +1,398 @@
+import builtins
+from typing import Dict, Iterable, List, Literal, NamedTuple, Optional, Set, Union
+
+import pandas as pd
+from django.core.exceptions import FieldDoesNotExist
+from django.db.models import CharField, TextField
+from django.db.models.query_utils import DeferredAttribute as Field
+from lamin_logger import logger
+from lamin_logger._lookup import Lookup
+from lamin_logger._search import search as base_search
+from lamindb_setup.dev._docs import doc_args
+from lnschema_core import ORM
+from lnschema_core.types import ListLike, StrField
+
+from lamindb.dev.utils import attach_func_to_class_method
+
+from . import _TESTING
+from ._from_values import _has_species_field, get_or_create_records
+from .dev._settings import settings
+
+IPYTHON = getattr(builtins, "__IPYTHON__", False)
+
+
+class ValidationError(Exception):
+    pass
+
+
+def init_self_from_db(self: ORM, existing_record: ORM):
+    new_args = [
+        getattr(existing_record, field.attname) for field in self._meta.concrete_fields
+    ]
+    super(self.__class__, self).__init__(*new_args)
+    self._state.adding = False  # mimic from_db
+    self._state.db = "default"
+
+
+def validate_required_fields(orm: ORM, kwargs):
+    required_fields = {
+        k.name for k in orm._meta.fields if not k.null and k.default is None
+    }
+    required_fields_not_passed = {k: None for k in required_fields if k not in kwargs}
+    kwargs.update(required_fields_not_passed)
+    missing_fields = [
+        k for k, v in kwargs.items() if v is None and k in required_fields
+    ]
+    if missing_fields:
+        raise TypeError(f"{missing_fields} are required.")
+
+
+def suggest_objects_with_same_name(orm: ORM, kwargs) -> Optional[str]:
+    if kwargs.get("name") is None:
+        return None
+    else:
+        results = orm.search(kwargs["name"])
+        if results.shape[0] == 0:
+            return None
+
+        # subset results to those with at least 0.5 levensteihn distance
+        results = results.loc[results.__ratio__ >= 90]
+
+        # test for exact match
+        if len(results) > 0:
+            if results.index[0] == kwargs["name"]:
+                logger.warning("Object with exact same name exists, returning it")
+                return "object-with-same-name-exists"
+            else:
+                msg = "Entries with similar names exist:"
+                if IPYTHON:
+                    from IPython.display import display
+
+                    logger.warning(f"{msg}")
+                    display(results)
+                else:
+                    logger.warning(f"{msg}\n{results.name}")
+    return None
+
+
+def __init__(orm: ORM, *args, **kwargs):
+    if not args:
+        validate_required_fields(orm, kwargs)
+        if settings.upon_create_search_names:
+            result = suggest_objects_with_same_name(orm, kwargs)
+            if result == "object-with-same-name-exists":
+                existing_record = orm.select(name=kwargs["name"])[0]
+                init_self_from_db(orm, existing_record)
+                return None
+        super(ORM, orm).__init__(**kwargs)
+    elif len(args) != len(orm._meta.concrete_fields):
+        raise ValueError("Please provide keyword arguments, not plain arguments")
+    else:
+        # object is loaded from DB (**kwargs could be omitted below, I believe)
+        super(ORM, orm).__init__(*args, **kwargs)
+
+
+@classmethod  # type:ignore
+@doc_args(ORM.from_values.__doc__)
+def from_values(cls, identifiers: ListLike, field: StrField, **kwargs) -> List["ORM"]:
+    """{}"""
+    if isinstance(field, str):
+        field = getattr(cls, field)
+    if not isinstance(field, Field):  # field is DeferredAttribute
+        raise TypeError(
+            "field must be a string or an ORM field, e.g., `CellType.name`!"
+        )
+    from_bionty = True if cls.__module__.startswith("lnschema_bionty.") else False
+    return get_or_create_records(
+        iterable=identifiers, field=field, from_bionty=from_bionty, **kwargs
+    )
+
+
+@classmethod  # type: ignore
+@doc_args(ORM.search.__doc__)
+def search(
+    cls,
+    string: str,
+    *,
+    field: Optional[StrField] = None,
+    top_hit: bool = False,
+    case_sensitive: bool = True,
+    synonyms_field: Optional[Union[str, TextField, CharField]] = "synonyms",
+    synonyms_sep: str = "|",
+) -> Union["pd.DataFrame", "ORM"]:
+    """{}"""
+    if field is None:
+        field = get_default_str_field(cls)
+    if not isinstance(field, str):
+        field = field.field.name
+
+    records = cls.objects.all()
+    df = pd.DataFrame.from_records(records.values())
+
+    result = base_search(
+        df=df,
+        string=string,
+        field=field,
+        synonyms_field=str(synonyms_field),
+        case_sensitive=case_sensitive,
+        return_ranked_results=not top_hit,
+        synonyms_sep=synonyms_sep,
+        tuple_name=cls.__name__,
+    )
+
+    if not top_hit or result is None:
+        return result
+    else:
+        if isinstance(result, list):
+            return [records.get(id=r.id) for r in result]
+        else:
+            return records.get(id=result.id)
+
+
+@classmethod  # type: ignore
+@doc_args(ORM.lookup.__doc__)
+def lookup(cls, field: Optional[StrField] = None) -> NamedTuple:
+    """{}"""
+    if field is None:
+        field = get_default_str_field(cls)
+    if not isinstance(field, str):
+        field = field.field.name
+
+    records = cls.objects.all()
+
+    return Lookup(
+        records=records,
+        values=[i.get(field) for i in records.values()],
+        tuple_name=cls.__name__,
+        prefix="ln",
+    ).lookup()
+
+
+@classmethod  # type: ignore
+@doc_args(ORM.inspect.__doc__)
+def inspect(
+    cls,
+    identifiers: ListLike,
+    field: StrField,
+    *,
+    case_sensitive: bool = False,
+    inspect_synonyms: bool = True,
+    return_df: bool = False,
+    logging: bool = True,
+    **kwargs,
+) -> Union["pd.DataFrame", Dict[str, List[str]]]:
+    """{}"""
+    from lamin_logger._inspect import inspect
+
+    if not isinstance(field, str):
+        field = field.field.name
+
+    return inspect(
+        df=_filter_df_based_on_species(orm=cls, species=kwargs.get("species")),
+        identifiers=identifiers,
+        field=str(field),
+        case_sensitive=case_sensitive,
+        inspect_synonyms=inspect_synonyms,
+        return_df=return_df,
+        logging=logging,
+    )
+
+
+@classmethod  # type: ignore
+@doc_args(ORM.map_synonyms.__doc__)
+def map_synonyms(
+    cls,
+    synonyms: Iterable,
+    *,
+    return_mapper: bool = False,
+    case_sensitive: bool = False,
+    keep: Literal["first", "last", False] = "first",
+    synonyms_field: str = "synonyms",
+    synonyms_sep: str = "|",
+    field: Optional[str] = None,
+    **kwargs,
+) -> Union[List[str], Dict[str, str]]:
+    """{}"""
+    from lamin_logger._map_synonyms import map_synonyms
+
+    if field is None:
+        field = get_default_str_field(cls)
+    if not isinstance(field, str):
+        field = field.field.name
+
+    try:
+        cls._meta.get_field(synonyms_field)
+        df = _filter_df_based_on_species(orm=cls, species=kwargs.get("species"))
+    except FieldDoesNotExist:
+        df = pd.DataFrame()
+    return map_synonyms(
+        df=df,
+        identifiers=synonyms,
+        field=field,
+        return_mapper=return_mapper,
+        case_sensitive=case_sensitive,
+        keep=keep,
+        synonyms_field=synonyms_field,
+        sep=synonyms_sep,
+    )
+
+
+def _filter_df_based_on_species(orm: ORM, species: Optional[Union[str, ORM]] = None):
+    import pandas as pd
+
+    records = orm.objects.all()
+    if _has_species_field(orm):
+        # here, we can safely import lnschema_bionty
+        from lnschema_bionty._bionty import create_or_get_species_record
+
+        species_record = create_or_get_species_record(species=species, orm=orm)
+        if species_record is not None:
+            records = records.filter(species__name=species_record.name)
+
+    return pd.DataFrame.from_records(records.values())
+
+
+def get_default_str_field(orm: ORM) -> str:
+    """Get the 1st char or text field from the orm."""
+    model_field_names = [i.name for i in orm._meta.fields]
+
+    # set default field
+    if "name" in model_field_names:
+        # by default use the name field
+        field = orm._meta.get_field("name")
+    else:
+        # first char or text field that doesn't contain "id"
+        for i in orm._meta.fields:
+            if "id" in i.name:
+                continue
+            if i.get_internal_type() in {"CharField", "TextField"}:
+                field = i
+                break
+
+    # no default field can be found
+    if field is None:
+        raise ValueError("Please specify a field to search against!")
+
+    return field.name
+
+
+def _add_or_remove_synonyms(
+    synonym: Union[str, Iterable],
+    record: ORM,
+    action: Literal["add", "remove"],
+    force: bool = False,
+):
+    """Add or remove synonyms."""
+
+    def check_synonyms_in_all_records(synonyms: Set[str], record: ORM):
+        """Errors if input synonym is associated with other records in the DB."""
+        import pandas as pd
+        from IPython.display import display
+
+        syns_all = (
+            record.__class__.objects.exclude(synonyms="").exclude(synonyms=None).all()
+        )
+        if len(syns_all) == 0:
+            return
+        df = pd.DataFrame(syns_all.values())
+        df["synonyms"] = df["synonyms"].str.split("|")
+        df = df.explode("synonyms")
+        matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)]
+        if matches_df.shape[0] > 0:
+            records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values())
+            logger.error(
+                f"Input synonyms {matches_df['synonyms'].unique()} already associated"
+                " with the following records:\n(Pass `force=True` to ignore this error)"
+            )
+            display(records_df)
+            raise SystemExit(AssertionError)
+
+    # passed synonyms
+    if isinstance(synonym, str):
+        syn_new_set = set([synonym])
+    else:
+        syn_new_set = set(synonym)
+    # nothing happens when passing an empty string or list
+    if len(syn_new_set) == 0:
+        return
+    # because we use | as the separator
+    if any(["|" in i for i in syn_new_set]):
+        raise AssertionError("A synonym can't contain '|'!")
+
+    # existing synonyms
+    syns_exist = record.synonyms
+    if syns_exist is None or len(syns_exist) == 0:
+        syns_exist_set = set()
+    else:
+        syns_exist_set = set(syns_exist.split("|"))
+
+    if action == "add":
+        if not force:
+            check_synonyms_in_all_records(syn_new_set, record)
+        syns_exist_set.update(syn_new_set)
+    elif action == "remove":
+        syns_exist_set = syns_exist_set.difference(syn_new_set)
+
+    if len(syns_exist_set) == 0:
+        syns_str = None
+    else:
+        syns_str = "|".join(syns_exist_set)
+
+    record.synonyms = syns_str
+
+    # if record is already in DB, save the changes to DB
+    if not record._state.adding:
+        record.save()
+
+
+def _check_synonyms_field_exist(record: ORM):
+    try:
+        record.__getattribute__("synonyms")
+    except AttributeError:
+        raise NotImplementedError(
+            f"No synonyms field found in table {record.__class__.__name__}!"
+        )
+
+
+def add_synonym(self, synonym: Union[str, ListLike], force: bool = False):
+    _check_synonyms_field_exist(self)
+    _add_or_remove_synonyms(synonym=synonym, record=self, force=force, action="add")
+
+
+def remove_synonym(self, synonym: Union[str, ListLike]):
+    _check_synonyms_field_exist(self)
+    _add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
+
+
+METHOD_NAMES = [
+    "__init__",
+    "search",
+    "lookup",
+    "map_synonyms",
+    "inspect",
+    "add_synonym",
+    "remove_synonym",
+    "from_values",
+]
+
+if _TESTING:
+    from inspect import signature
+
+    SIGS = {
+        name: signature(getattr(ORM, name))
+        for name in METHOD_NAMES
+        if not name.startswith("__")
+    }
+
+for name in METHOD_NAMES:
+    attach_func_to_class_method(name, ORM, globals())
+
+
+@classmethod  # type: ignore
+def __name_with_type__(cls) -> str:
+    schema_module_name = cls.__module__.split(".")[0]
+    schema_name = schema_module_name.replace("lnschema_", "")
+    return f"{schema_name}.{cls.__name__}"
+
+
+setattr(ORM, "__name_with_type__", __name_with_type__)
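These functions are attached to every ORM subclass via `attach_func_to_class_method`. A minimal usage sketch follows; `CellType` is borrowed from `lnschema_bionty` only because the `from_values` error message cites it, and the exact import path and registry contents are assumptions, not part of this diff.

import lamindb as ln
from lnschema_bionty import CellType  # assumed import path

# create validated records from values of the `name` field
records = CellType.from_values(["T cell", "B cell"], field=CellType.name)
ln.save(records)

CellType.search("T cel")                # ranked pd.DataFrame of fuzzy matches
CellType.search("T cel", top_hit=True)  # single best-matching record
lookup = CellType.lookup()              # auto-complete namedtuple over all records

record = CellType.select(name="T cell")[0]
record.add_synonym("T-cell")            # stored "|"-separated; saved if already in the DB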
lamindb/_save.py
CHANGED
@@ -5,11 +5,14 @@ from typing import Iterable, List, Optional, Tuple, Union, overload # noqa
 import lamindb_setup
 from django.db import transaction
 from lamin_logger import logger
-from lnschema_core.models import
+from lnschema_core.models import ORM, File

-from lamindb._file_access import auto_storage_key_from_file
 from lamindb.dev.storage import store_object
-from lamindb.dev.storage.
+from lamindb.dev.storage.file import (
+    auto_storage_key_from_file,
+    delete_storage_using_key,
+    print_hook,
+)

 try:
     from lamindb.dev.storage._zarr import write_adata_zarr
@@ -20,7 +23,7 @@ except ImportError:


 @overload
-def save(record:
+def save(record: ORM) -> ORM:
     ...


@@ -28,11 +31,11 @@ def save(record: BaseORM) -> BaseORM:
 # Overloaded function signature 2 will never be matched: signature 1's parameter
 # type(s) are the same or broader
 @overload
-def save(records: Iterable[
+def save(records: Iterable[ORM]) -> Iterable[ORM]:  # type: ignore
     ...


-def save(record: Union[
+def save(record: Union[ORM, Iterable[ORM]], **kwargs) -> None:  # type: ignore
     """Save to database & storage.

     Inserts a new :term:`record` if the corresponding row doesn't exist.
@@ -42,7 +45,7 @@ def save(record: Union[BaseORM, Iterable[BaseORM]], **fields) -> None:  # type:
     passing it to `save`.

     Args:
-        record: One or multiple `
+        record: One or multiple `ORM` objects.

     Returns:
         The record as returned from the database with a `created_at` timestamp.
@@ -68,18 +71,31 @@ def save(record: Union[BaseORM, Iterable[BaseORM]], **fields) -> None:  # type:
     """
     if isinstance(record, Iterable):
         records = set(record)
-    elif isinstance(record,
+    elif isinstance(record, ORM):
         records = {record}

+    def atomic_save(records: Iterable[ORM], **kwargs):
+        with transaction.atomic():
+            for record in records:
+                record.save(**kwargs)
+
     # we're distinguishing between files and non-files
     # because for files, we want to bulk-upload
     # rather than upload one-by-one
     files = {r for r in records if isinstance(r, File)}
     non_files = records.difference(files)
     if non_files:
-
+        non_files_with_parents = {r for r in non_files if hasattr(r, "_parents")}
+        if len(non_files_with_parents) < 2 or kwargs.get("parents") is False:
+            atomic_save(non_files)
+        else:
+            logger.warning("Recursing through parents will take a while...")
+            # first save all records without recursing parents
+            atomic_save(non_files, parents=False)
+            # save the record with parents one by one
             for record in non_files:
                 record.save()
+
     if files:
         with transaction.atomic():
             for record in files:
@@ -190,5 +206,5 @@ def upload_data_object(file) -> None:
     ):
         logger.hint(f"storing file {file.id} with key {file_storage_key}")
         storagepath = lamindb_setup.settings.storage.key_to_filepath(file_storage_key)
-        print_progress = partial(print_hook, filepath=file.
+        print_progress = partial(print_hook, filepath=file.key)
         write_adata_zarr(file._memory_rep, storagepath, callback=print_progress)
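Non-file records now go through the nested `atomic_save` helper: with fewer than two records carrying `_parents` (or with `parents=False` passed), everything is saved in a single transaction; otherwise parents are skipped first and each record is then saved individually. A hedged sketch of the resulting call pattern, reusing the hypothetical `CellType` records from the sketch above:

import lamindb as ln
from lnschema_bionty import CellType  # assumed import path

cell_types = CellType.from_values(["T cell", "B cell"], field=CellType.name)

# bulk save; parent recursion kicks in automatically for several bionty records
ln.save(cell_types)

# explicitly skip the (slow) parent recursion
ln.save(cell_types, parents=False)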
lamindb/_select.py
CHANGED
@@ -1,12 +1,12 @@
 from typing import Union

 from django.db.models import Manager
-from lnschema_core import
+from lnschema_core import ORM
 from lnschema_core._queryset import QuerySet


-def select(*ORM:
-    """Query
+def select(*ORM: ORM, **expressions) -> Union[QuerySet, Manager]:
+    """Query records.

     Guide: :doc:`/guide/select`.

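The new signature takes one or more ORM classes plus field expressions and returns a QuerySet or Manager. A sketch (the `suffix` filter value is illustrative):

import lamindb as ln

qs = ln.select(ln.File, suffix=".h5ad")  # QuerySet filtered by field values
first = qs.first()                       # Django-style QuerySet methods are assumed to apply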
lamindb/_view.py
CHANGED
@@ -6,7 +6,7 @@ from IPython.display import display
 from lamin_logger import colors
 from lamindb_setup import settings
 from lamindb_setup.dev._setup_schema import get_schema_module_name
-from lnschema_core import
+from lnschema_core import ORM

 from ._select import select

@@ -30,7 +30,7 @@ def view(n: int = 10, schema: Optional[str] = None):
     orms = [
         i
         for i in schema_module.__dict__.values()
-        if inspect.isclass(i) and issubclass(i,
+        if inspect.isclass(i) and issubclass(i, ORM) and i.__name__ != "ORM"
     ]
     if len(schema_names) > 1:
         section = f"* module: {colors.green(colors.bold(schema_name))} *"
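`view` now collects ORM subclasses from each schema module while excluding the base `ORM` class itself. Example calls per the signature in the hunk header (the schema name is illustrative):

import lamindb as ln

ln.view()                      # latest 10 records per registry
ln.view(n=5, schema="bionty")  # restrict to one schema module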
lamindb/dev/__init__.py
CHANGED
@@ -3,7 +3,7 @@
 .. autosummary::
    :toctree: .

-
+   ORM
    QuerySet
    datasets
    hashing
@@ -12,7 +12,7 @@
 """

 from lnschema_core._queryset import QuerySet
-from lnschema_core.models import
+from lnschema_core.models import ORM

 from . import datasets  # noqa
 from ._settings import Settings
lamindb/dev/_settings.py
CHANGED
@@ -15,6 +15,7 @@ class Settings:

     def __init__(self):
         self._verbosity: int = 2  # info-level logging
+        logger.set_verbosity(self._verbosity)

     upon_file_create_if_hash_exists: Literal[
         "warn_return_existing", "error", "warn_create_new"
@@ -70,7 +71,7 @@ class Settings:

     @property
     def verbosity(self) -> int:
-        """Verbosity (default
+        """Verbosity (default 3).

         - 0: only show 'error' messages
         - 1: also show 'warning' messages
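Verbosity is now pushed to the logger when `Settings` is initialized, with levels per the docstring (0 errors, 1 adds warnings, 2 adds info, 3 adds hints). Assuming the instance is exposed as `ln.settings`:

import lamindb as ln

ln.settings.verbosity = 1  # keep errors and warnings, silence info and hints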
lamindb/dev/datasets/__init__.py
CHANGED
@@ -11,6 +11,9 @@
    dir_scrnaseq_cellranger
    generate_cell_ranger_files
    df_iris
+   df_iris_in_meter
+   df_iris_in_meter_batch1
+   df_iris_in_meter_batch2
    anndata_mouse_sc_lymph_node
    anndata_human_immune_cells
    anndata_pbmc68k_reduced
@@ -26,6 +29,9 @@ from ._core import (
     anndata_pbmc68k_reduced,
     anndata_with_obs,
     df_iris,
+    df_iris_in_meter,
+    df_iris_in_meter_batch1,
+    df_iris_in_meter_batch2,
     dir_scrnaseq_cellranger,
     file_bam,
     file_fastq,
lamindb/dev/datasets/_core.py
CHANGED
@@ -169,6 +169,36 @@ def df_iris() -> pd.DataFrame:
     return pd.read_parquet(filepath)


+def df_iris_in_meter() -> pd.DataFrame:
+    """The iris dataset with lenghts in meter."""
+    df = df_iris()
+    # rename columns
+    df.rename(
+        columns={
+            "sepal length (cm)": "sepal_length",
+            "sepal width (cm)": "sepal_width",
+            "petal length (cm)": "petal_length",
+            "petal width (cm)": "petal_width",
+            "target": "iris_species_code",
+        },
+        inplace=True,
+    )
+    df[["sepal_length", "sepal_width", "petal_length", "petal_width"]] /= 100
+    return df
+
+
+def df_iris_in_meter_batch1() -> pd.DataFrame:
+    """The iris dataset with lenghts in meter."""
+    df_iris = df_iris_in_meter()
+    return df_iris.iloc[: len(df_iris) // 2]
+
+
+def df_iris_in_meter_batch2() -> pd.DataFrame:
+    """The iris dataset with lenghts in meter."""
+    df_iris = df_iris_in_meter()
+    return df_iris.iloc[len(df_iris) // 2 :]
+
+
 def generate_cell_ranger_files(
     sample_name: str, basedir: Union[str, Path] = "./", output_only: bool = True
 ):
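The three new helpers derive from `df_iris`: columns are renamed, the length columns are divided by 100, and the two batch variants split the rows in half. A quick sketch:

from lamindb.dev import datasets

df = datasets.df_iris_in_meter()
batch1 = datasets.df_iris_in_meter_batch1()  # first half of the rows
batch2 = datasets.df_iris_in_meter_batch2()  # remaining rows
assert len(batch1) + len(batch2) == len(df)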
lamindb/dev/hashing.py
CHANGED
@@ -19,6 +19,10 @@ def to_b64_str(bstr: bytes):
     return b64


+def b16_to_b64(s: str):
+    return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))
+
+
 # a lot to read about this: lamin-notes/2022/hashing
 def hash_set(s: Set[str]) -> str:
     bstr = ":".join(sorted(s)).encode("utf-8")
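`b16_to_b64` converts a hex (base16) digest, optionally wrapped in quotes, into the package's base64 string form via the existing `to_b64_str`; the example digest below is illustrative only:

from lamindb.dev.hashing import b16_to_b64

b16_to_b64('"9e107d9d372bb6826bd81d3542a419d6"')  # quotes stripped, lowercase accepted (casefold=True)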
lamindb/dev/storage/__init__.py
CHANGED
@@ -4,6 +4,7 @@
    :toctree: .

    AnnDataAccessor
+   BackedAccessor
    UPath

 """
@@ -13,8 +14,8 @@ from lamindb_setup.dev.upath import infer_filesystem as _infer_filesystem
 from ._anndata_sizes import size_adata

 try:
-    from ._backed_access import AnnDataAccessor
+    from ._backed_access import AnnDataAccessor, BackedAccessor
 except ImportError:
     pass
-from .
-from .
+from .file import delete_storage, load_to_memory, store_object
+from .object import infer_suffix, write_to_file
lamindb/dev/storage/_backed_access.py
CHANGED
@@ -16,7 +16,7 @@ from fsspec.core import OpenFile
 from lamindb_setup.dev.upath import infer_filesystem
 from lnschema_core import File

-from lamindb.
+from lamindb.dev.storage.file import filepath_from_file

 ZARR_INSTALLED = False
 try:
@@ -430,9 +430,9 @@ if ZARR_INSTALLED:
     )

     if file.suffix in (".h5ad", ".zrad"):
-        return AnnDataAccessor(conn, storage, file.
+        return AnnDataAccessor(conn, storage, file.key)
     else:
         if get_spec(storage).encoding_type == "anndata":
-            return AnnDataAccessor(conn, storage, file.
+            return AnnDataAccessor(conn, storage, file.key)
         else:
             return BackedAccessor(conn, storage)
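Backed access now names the accessor by `file.key` and falls back to `BackedAccessor` for non-AnnData content; both accessor classes are re-exported from `lamindb.dev.storage` (see the `__init__.py` hunk above). A hedged sketch, assuming `File.backed()` is the public entry point (not shown in this diff):

import lamindb as ln

file = ln.select(ln.File, suffix=".h5ad").first()
backed = file.backed()  # AnnDataAccessor for .h5ad/.zrad, else BackedAccessor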
|