PyPI - lamindb - Versions diffs - 0.45a1__py3-none-any.whl → 0.46a1__py3-none-any.whl - Mend

lamindb 0.45a1__py3-none-any.whl → 0.46a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

lamindb/__init__.py +30 -9
lamindb/_context.py +11 -12
lamindb/_dataset.py +142 -0
lamindb/_delete.py +6 -6
lamindb/_feature_set.py +138 -0
lamindb/_file.py +322 -81
lamindb/_from_values.py +57 -160
lamindb/_orm.py +398 -0
lamindb/_save.py +26 -10
lamindb/_select.py +3 -3
lamindb/_view.py +2 -2
lamindb/dev/__init__.py +2 -2
lamindb/dev/_settings.py +2 -1
lamindb/dev/datasets/__init__.py +6 -0
lamindb/dev/datasets/_core.py +30 -0
lamindb/dev/hashing.py +4 -0
lamindb/dev/storage/__init__.py +4 -3
lamindb/dev/storage/_backed_access.py +3 -3
lamindb/dev/storage/{_file.py → file.py} +48 -3
lamindb/dev/storage/{_object.py → object.py} +1 -0
lamindb/dev/utils.py +9 -0
lamindb/types.py +9 -1
{lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/METADATA +20 -17
lamindb-0.46a1.dist-info/RECORD +36 -0
lamindb/_baseorm_methods.py +0 -535
lamindb/_featureset_methods.py +0 -73
lamindb/_file_access.py +0 -48
lamindb/_file_methods.py +0 -319
lamindb-0.45a1.dist-info/RECORD +0 -36
lamindb/{_transform_methods.py → _transform.py} +0 -0
{lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/LICENSE +0 -0
{lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/WHEEL +0 -0
{lamindb-0.45a1.dist-info → lamindb-0.46a1.dist-info}/entry_points.txt +0 -0

lamindb/_from_values.py CHANGED Viewed

@@ -1,20 +1,17 @@
-from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Dict, Iterable, List, Tuple, Union
-import numpy as np
 import pandas as pd
-from django.db.models import Q
+from django.core.exceptions import FieldDoesNotExist
 from django.db.models.query_utils import DeferredAttribute as Field
 from lamin_logger import colors, logger
-from lamindb_setup.dev import deprecated
-from lnschema_core.models import BaseORM
+from lnschema_core.models import ORM
+from lnschema_core.types import ListLike
 from ._select import select
 from .dev._settings import settings
-ListLike = TypeVar("ListLike", pd.Series, list, np.array)
-# The base function for `from_iter` and `from_bionty`
+# The base function for `from_values`
 def get_or_create_records(
     iterable: ListLike,
     field: Field,
@@ -47,7 +44,7 @@ def get_or_create_records(
             if len(unmapped_values) > 0:
                 for i in unmapped_values:
                     records.append(model(**{field_name: i}, **kwargs))
-                logger.hint(
+                logger.info(
                     "Created"
                     f" {colors.red(f'{len(unmapped_values)} {model.__name__} records')}"
                     f" with a single field {colors.red(f'{field_name}')}"
@@ -57,103 +54,20 @@ def get_or_create_records(
         settings.upon_create_search_names = upon_create_search_names
-@deprecated("ORM.from_iter()")
-def parse(
-    iterable: Union[ListLike, pd.DataFrame],
-    field: Union[Field, Dict[str, Field]],
-    *,
-    species: Optional[str] = None,
-) -> List[BaseORM]:
-    """Parse identifiers and create records through lookups for a given field.
-    Guide: :doc:`/biology/registries`.
-    Args:
-        iterable: `Union[ListLike, pd.DataFrame]` A `ListLike` of identifiers or
-            a `DataFrame`.
-        field: `Union[Field, Dict[str, Field]]` If `iterable` is `ListLike`, a
-            `BaseORM` field to look up.
-            If `iterable` is `DataFrame`, a dict of `{column_name1: field1,
-            column_name2: field2}`.
-        species: `Optional[str]` Either `"human"`, `"mouse"`, or any other
-            `name` of `Bionty.Species`. If `None`, will use default species in
-            bionty for each entity.
-    Returns:
-        A list of records.
-    For every `value` in an iterable of identifiers and a given `ORM.field`,
-    this function performs:
-    1. It checks whether the value already exists in the database
-       (`ORM.select(field=value)`). If so, it adds the queried record to
-       the returned list and skips step 2. Otherwise, proceed with 2.
-    2. If the `ORM` is from `lnschema_bionty`, it checks whether there is an
-       exact match in the underlying ontology (`Bionty.inspect(value, field)`).
-       If so, it creates a record from Bionty and adds it to the returned list.
-       Otherwise, it create a record that populates a single field using `value`
-       and adds the record to the returned list.
-    """
-    upon_create_search_names = settings.upon_create_search_names
-    settings.upon_create_search_names = False
-    try:
-        if isinstance(iterable, pd.DataFrame):
-            # check the field must be a dictionary
-            if not isinstance(field, dict):
-                raise TypeError("field must be a dictionary of {column_name: Field}!")
-            # check only one single model class is passed
-            class_mapper = {f.field.name: f.field.model for f in field.values()}
-            if len(set(class_mapper.values())) > 1:
-                raise NotImplementedError("fields must from the same entity!")
-            model = list(class_mapper.values())[0]
-            df = _map_columns_to_fields(df=iterable, field=field)
-            df_records = df.to_dict(orient="records")
-            # make sure to only return 1 existing entry for each row
-            queryset = get_existing_records_multifields(
-                df_records=df_records, model=model
-            )
-            records = queryset.list()
-            df_records_new = [
-                i for i in df_records if not queryset.filter(**i).exists()
-            ]
-            if len(records) > 0:
-                logger.hint(
-                    "Returned"
-                    f" {colors.green(f'{len(records)} existing {model.__name__} DB records')}"  # noqa
-                )
-            if len(df_records_new) > 0:
-                logger.hint(
-                    "Created"
-                    f" {colors.purple(f'{len(df_records_new)} {model.__name__} records')} with"  # noqa
-                    f" {df.shape[1]} fields"
-                )
-                records += [model(**i) for i in df_records_new]
-            return records
-        else:
-            if not isinstance(field, Field):
-                raise TypeError("field must be an ORM field, e.g., `CellType.name`!")
-            return get_or_create_records(
-                iterable=iterable, field=field, species=species
-            )
-    finally:
-        settings.upon_create_search_names = upon_create_search_names
-def index_iterable(iterable: Iterable) -> pd.Index:
-    idx = pd.Index(iterable).unique()
-    # No entries are made for NAs, '', None
-    # returns an ordered unique not null list
-    return idx[(idx != "") & (~idx.isnull())]
 def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}):
     field_name = field.field.name
     model = field.field.model
+    condition: Dict = {}
+    if _has_species_field(model):
+        from lnschema_bionty._bionty import create_or_get_species_record
+        species_record = create_or_get_species_record(
+            species=kwargs.get("species"), orm=model
+        )
+        if species_record is not None:
+            kwargs.update({"species": species_record})
+            condition.update({"species__name": species_record.name})
     # map synonyms based on the DB reference
     syn_mapper = model.map_synonyms(
@@ -173,22 +87,21 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
     # if necessary, create records for the values in kwargs
     # k:v -> k:v_record
     # kwargs is used to deal with species
-    condition = {f"{field_name}__in": iterable_idx.values}
-    kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs, condition=condition)
+    condition.update({f"{field_name}__in": iterable_idx.values})
     stmt = select(model, **condition)
     records = stmt.list()  # existing records
     n_name = len(records) - len(syn_mapper)
     if n_name > 0:
-        logger.hint(
+        logger.info(
             "Returned"
             f" {colors.green(f'{n_name} existing {model.__name__} DB records')} that"
             f" matched {colors.green(f'{field_name}')} field"
         )
     # make sure that synonyms logging appears after the field logging
     if len(syn_msg) > 0:
-        logger.hint(syn_msg)
+        logger.info(syn_msg)
     existing_values = iterable_idx.intersection(stmt.values_list(field_name, flat=True))
     nonexist_values = iterable_idx.difference(existing_values)
@@ -196,33 +109,6 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
     return records, nonexist_values
-def get_existing_records_multifields(
-    df_records: List, model: BaseORM, kwargs: Dict = {}
-):
-    q = Q(**df_records[0])
-    for df_record in df_records[1:]:
-        q = q.__getattribute__("__or__")(Q(**df_record))
-    kwargs, condition = _species_kwargs(orm=model, kwargs=kwargs)
-    stmt = model.select(**condition).filter(q)
-    return stmt
-def _species_kwargs(orm: BaseORM, kwargs: Dict = {}, condition: Dict = {}):
-    """Create records based on the kwargs."""
-    if kwargs.get("species") is not None:
-        from lnschema_bionty._bionty import create_or_get_species_record
-        species_record = create_or_get_species_record(
-            species=kwargs.get("species"), orm=orm
-        )
-        if species_record is not None:
-            kwargs.update({"species": species_record})
-            condition.update({"species__name": species_record.name})
-    return kwargs, condition
 def create_records_from_bionty(
     iterable_idx: pd.Index,
     field: Field,
@@ -232,10 +118,10 @@ def create_records_from_bionty(
     field_name = field.field.name
     records: List = []
     # populate additional fields from bionty
-    from lnschema_bionty._bionty import get_bionty_object, get_bionty_source_record
+    from lnschema_bionty._bionty import get_bionty_source_record
     # create the corresponding bionty object from model
-    bionty_object = get_bionty_object(orm=model, species=kwargs.get("species"))
+    bionty_object = model.bionty(species=kwargs.get("species"))
     # add bionty_source record to the kwargs
     kwargs.update({"bionty_source": get_bionty_source_record(bionty_object)})
@@ -261,7 +147,7 @@ def create_records_from_bionty(
     mapped_values = iterable_idx.intersection(bionty_df[field_name])
     if len(mapped_values) > 0:
-        bionty_kwargs = _bulk_create_dicts_from_df(
+        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
             keys=mapped_values, column_name=field_name, df=bionty_df
         )
         for bk in bionty_kwargs:
@@ -282,46 +168,57 @@ def create_records_from_bionty(
                 f" {colors.purple(f'{n_name} {model.__name__} records from Bionty')} that"  # noqa
                 f" matched {colors.purple(f'{field_name}')} field"
             )
-            logger.hint(msg + source_msg)
+            logger.info(msg + source_msg)
         # make sure that synonyms logging appears after the field logging
         if len(msg_syn) > 0:
-            logger.hint(msg_syn + source_msg)
+            logger.info(msg_syn + source_msg)
+        # warning about multi matches
+        if len(multi_msg) > 0:
+            logger.warning(multi_msg)
     # return the values that are not found in the bionty reference
     unmapped_values = iterable_idx.difference(mapped_values)
     return records, unmapped_values
-def _filter_bionty_df_columns(model: BaseORM, bionty_object: Any) -> pd.DataFrame:
+def index_iterable(iterable: Iterable) -> pd.Index:
+    idx = pd.Index(iterable).unique()
+    # No entries are made for NAs, '', None
+    # returns an ordered unique not null list
+    return idx[(idx != "") & (~idx.isnull())]
+def _filter_bionty_df_columns(model: ORM, bionty_object: Any) -> pd.DataFrame:
     bionty_df = pd.DataFrame()
     if bionty_object is not None:
         model_field_names = {i.name for i in model._meta.fields}
+        # parents needs to be added here as relationships aren't in fields
+        model_field_names.add("parents")
         bionty_df = bionty_object.df().reset_index()
+        # rename definition to description for the lnschema_bionty
+        bionty_df.rename(columns={"definition": "description"}, inplace=True)
         bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
     return bionty_df
 def _bulk_create_dicts_from_df(
     keys: Union[set, List], column_name: str, df: pd.DataFrame
-) -> dict:
+) -> Tuple[Dict, str]:
     """Get fields from a DataFrame for many rows."""
+    multi_msg = ""
     if df.index.name != column_name:
-        df = df.set_index(column_name)
-    # keep the last record (assuming most recent) if duplicated
-    df = df[~df.index.duplicated(keep="last")]
-    return df.loc[list(keys)].reset_index().to_dict(orient="records")
-def _map_columns_to_fields(df: pd.DataFrame, field: dict) -> pd.DataFrame:
-    """Subset dataframe to mappable fields columns and clean up."""
-    column_mapper = {colname: f.field.name for colname, f in field.items()}
-    # subset to columns containing fields
-    df = df.copy()
-    if df.index.name is not None:
-        df = df.reset_index()
-    df = df.loc[:, df.columns.isin(field.keys())]
-    df = df.rename(columns=column_mapper)
-    df = df.dropna().drop_duplicates()
-    # TODO: remove after having the auto conversion for django ORMs
-    df = df.mask(df == "", None)
-    return df
+        df = df.set_index(column_name).loc[list(keys)]
+    if not df.index.is_unique:
+        # return all records for multi-matches with a warning
+        dup = df.index[df.index.duplicated()].unique().tolist()
+        multi_msg = f"Multiple matches found in Bionty for: {dup}"
+    return df.reset_index().to_dict(orient="records"), multi_msg
+def _has_species_field(orm: ORM) -> bool:
+    try:
+        orm._meta.get_field("species")
+        return True
+    except FieldDoesNotExist:
+        return False

lamindb 0.45a1__py3-none-any.whl → 0.46a1__py3-none-any.whl

lamindb 0.45a1__py3-none-any.whl → 0.46a1__py3-none-any.whl