lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_feature_set.py CHANGED
@@ -118,7 +118,7 @@ def from_values(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  raise_validation_error: bool = True,
  ) -> FeatureSet:
  """{}""" # noqa: D415
@@ -139,7 +139,7 @@ def from_values(
  not_validated_values = values_array[~validated]
  msg = (
  f"These values could not be validated: {not_validated_values.tolist()}\n"
- f"If there are no typos, add them to their registry: {registry}"
+ f"If there are no typos, add them to their registry: {registry.__name__}"
  )
  if raise_validation_error:
  raise ValidationError(msg)
@@ -149,7 +149,7 @@ def from_values(
  validated_values,
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
@@ -168,7 +168,7 @@ def from_df(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  ) -> FeatureSet | None:
  """{}""" # noqa: D415
  registry = field.field.model
@@ -189,7 +189,7 @@ def from_df(
  df.columns[validated],
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
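
The change in this file is the rename of the `public_source=` keyword to `source=` in `FeatureSet.from_values` and `FeatureSet.from_df` (plus a clearer validation-error message). A minimal, hypothetical migration sketch, assuming a lamindb instance with the bionty schema enabled; the lookup of `source_record` is illustrative only:

import bionty as bt
import lamindb as ln

# pick the ontology source to validate against (illustrative query)
source_record = bt.Source.filter(entity="Gene", organism="human").first()

# 0.74.x: ln.FeatureSet.from_values(..., public_source=source_record)
# 0.75.x: the keyword is now `source`
feature_set = ln.FeatureSet.from_values(
    ["ENSG00000139618", "ENSG00000141510"],
    field=bt.Gene.ensembl_gene_id,
    organism="human",
    source=source_record,
)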
lamindb/_filter.py CHANGED
@@ -21,9 +21,9 @@ def filter(Record: type[Record], **expressions) -> QuerySet:
  ):
  visibility = "visibility"
  if not any(e.startswith(visibility) for e in expressions):
- expressions[
- visibility
- ] = VisibilityChoice.default.value # default visibility
+ expressions[visibility] = (
+ VisibilityChoice.default.value
+ ) # default visibility
  # if visibility is None, do not apply a filter
  # otherwise, it would mean filtering for NULL values, which doesn't make
  # sense for a non-NULLABLE column
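
This hunk is a pure reformatting of the default-visibility injection; the behavior described in the comments is unchanged. A short sketch of that contract, assuming the `VisibilityChoice` values in lnschema_core (1 default, 0 hidden):

import lamindb as ln

# no visibility expression: the default visibility is injected automatically
visible = ln.Artifact.filter(suffix=".ipynb")

# an explicit visibility expression disables the injection
hidden = ln.Artifact.filter(visibility=0)

# visibility=None: no visibility filter is applied at all
everything = ln.Artifact.filter(visibility=None)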
lamindb/_finish.py CHANGED
@@ -80,8 +80,8 @@ def save_run_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == TransformType.notebook
- source_code_path = filepath
+ is_notebook = transform.type == "notebook"
+ _source_code_artifact_path = filepath
  # for notebooks, we need more work
  if is_notebook:
  try:
@@ -134,12 +134,12 @@ def save_run_context_core(
  )
  # strip the output from the notebook to create the source code file
  # first, copy the notebook file to a temporary file in the cache
- source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
- shutil.copy2(filepath, source_code_path) # copy
+ _source_code_artifact_path = ln_setup.settings.storage.cache_dir / filepath.name
+ shutil.copy2(filepath, _source_code_artifact_path) # copy
  subprocess.run(
  [
  "nbstripout",
- source_code_path,
+ _source_code_artifact_path,
  "--extra-keys",
  "metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
  ],
@@ -152,31 +152,34 @@ def save_run_context_core(
  transform_family = transform.versions
  if len(transform_family) > 0:
  for prev_transform in transform_family.order_by("-created_at"):
- if prev_transform.latest_report_id is not None:
- prev_report = prev_transform.latest_report
- if prev_transform.source_code_id is not None:
- prev_source = prev_transform.source_code
+ if (
+ prev_transform.latest_run is not None
+ and prev_transform.latest_run.report_id is not None
+ ):
+ prev_report = prev_transform.latest_run.report
+ if prev_transform._source_code_artifact_id is not None:
+ prev_source = prev_transform._source_code_artifact
  ln.settings.creation.artifact_silence_missing_run_warning = True

  # track source code
- if transform.source_code_id is not None:
+ if transform._source_code_artifact_id is not None:
  # check if the hash of the transform source code matches
  # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
- hash, _ = hash_file(source_code_path) # ignore hash_type for now
- if hash != transform.source_code.hash:
+ hash, _ = hash_file(_source_code_artifact_path) # ignore hash_type for now
+ if hash != transform._source_code_artifact.hash:
  if os.getenv("LAMIN_TESTING") is None:
  # in test, auto-confirm overwrite
  response = input(
- f"You are about to replace (overwrite) existing source code (hash '{transform.source_code.hash}') for transform version"
+ f"You are about to replace (overwrite) existing source code (hash '{transform._source_code_artifact.hash}') for transform version"
  f" '{transform.version}'. Proceed? (y/n)"
  )
  else:
  response = "y"
  if response == "y":
- transform.source_code.replace(source_code_path)
- transform.source_code.save(upload=True)
+ transform._source_code_artifact.replace(_source_code_artifact_path)
+ transform._source_code_artifact.save(upload=True)
  logger.success(
- f"replaced transform.source_code: {transform.source_code}"
+ f"replaced transform._source_code_artifact: {transform._source_code_artifact}"
  )
  else:
  logger.warning("Please re-run `ln.track()` to make a new version")
@@ -184,17 +187,19 @@ def save_run_context_core(
  else:
  logger.important("source code is already saved")
  else:
- source_code = ln.Artifact(
- source_code_path,
+ _source_code_artifact = ln.Artifact(
+ _source_code_artifact_path,
  description=f"Source of transform {transform.uid}",
  version=transform.version,
  is_new_version_of=prev_source,
  visibility=0, # hidden file
  run=False,
  )
- source_code.save(upload=True, print_progress=False)
- transform.source_code = source_code
- logger.debug(f"saved transform.source_code: {transform.source_code}")
+ _source_code_artifact.save(upload=True, print_progress=False)
+ transform._source_code_artifact = _source_code_artifact
+ logger.debug(
+ f"saved transform._source_code_artifact: {transform._source_code_artifact}"
+ )

  # track environment
  env_path = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
@@ -257,8 +262,9 @@ def save_run_context_core(
  run.report = report_file
  run.is_consecutive = is_consecutive
  run.save()
- transform.latest_report = run.report
- logger.debug(f"saved transform.latest_report: {transform.latest_report}")
+ logger.debug(
+ f"saved transform.latest_run.report: {transform.latest_run.report}"
+ )
  transform.save()

  # finalize
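
Most of this file's changes are an internal rename: the transform's source code is now stored on the private `_source_code_artifact` field (previously `source_code`), and the run report is reached through `latest_run` instead of a `latest_report` field on the transform. A hedged access sketch for code that previously read these attributes; the record lookup is hypothetical:

import lamindb as ln

transform = ln.Transform.filter(name="my analysis").first()  # hypothetical lookup
if transform is not None:
    # 0.74.x: transform.source_code and transform.latest_report
    source_code_artifact = transform._source_code_artifact  # private field, may be None
    report = transform.latest_run.report if transform.latest_run is not None else None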
lamindb/_from_values.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING, Any, Iterable
+ from typing import TYPE_CHECKING, Iterable

  import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
@@ -19,9 +19,9 @@ def get_or_create_records(
  field: StrField,
  *,
  create: bool = False,
- from_public: bool = False,
+ from_source: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  mute: bool = False,
  ) -> list[Record]:
  """Get or create records from iterables."""
@@ -34,8 +34,8 @@ def get_or_create_records(
  kwargs: dict = {}
  if organism is not None:
  kwargs["organism"] = organism
- if public_source is not None:
- kwargs["public_source"] = public_source
+ if source is not None:
+ kwargs["source"] = source
  settings.creation.search_names = False
  try:
  iterable_idx = index_iterable(iterable)
@@ -47,8 +47,17 @@ def get_or_create_records(

  # new records to be created based on new values
  if len(nonexist_values) > 0:
- if from_public:
- records_bionty, unmapped_values = create_records_from_public(
+ if source:
+ from_source = not source.in_db
+ elif (
+ records
+ and hasattr(records[0], "source_id")
+ and records[0].source_id
+ and records[0].source.in_db
+ ):
+ from_source = False
+ if from_source:
+ records_bionty, unmapped_values = create_records_from_source(
  iterable_idx=nonexist_values,
  field=field,
  msg=msg,
@@ -58,7 +67,7 @@ def get_or_create_records(
  if len(records_bionty) > 0:
  msg = ""
  for record in records_bionty:
- record._from_public = True
+ record._from_source = True
  records += records_bionty
  else:
  unmapped_values = nonexist_values
@@ -75,7 +84,7 @@ def get_or_create_records(
  f"{colors.red('did not create')} {name} record{s} for "
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
  )
- if Record.__module__.startswith("lnschema_bionty.") or Record == ULabel:
+ if Record.__module__.startswith("bionty.") or Record == ULabel:
  if isinstance(iterable, pd.Series):
  feature = iterable.name
  feature_name = None
@@ -100,8 +109,8 @@ def get_existing_records(
  model = field.field.model
  condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
  # existing records matching is agnostic to the bionty source
- if "public_source" in condition:
- condition.pop("public_source")
+ if "source" in condition:
+ condition.pop("source")

  # standardize based on the DB reference
  # log synonyms mapped terms
@@ -109,7 +118,7 @@ def get_existing_records(
  iterable_idx,
  field=field,
  organism=kwargs.get("organism"),
- public_source=kwargs.get("public_source"),
+ source=kwargs.get("source"),
  mute=True,
  )
  syn_mapper = result.synonyms_mapper
@@ -174,7 +183,7 @@ def get_existing_records(
  return records, nonexist_values, msg


- def create_records_from_public(
+ def create_records_from_source(
  iterable_idx: pd.Index,
  field: StrField,
  msg: str = "",
@@ -184,7 +193,8 @@ def create_records_from_public(
  model = field.field.model
  records: list = []
  # populate additional fields from bionty
- from lnschema_bionty._bionty import get_public_source_record
+ from bionty._bionty import get_source_record
+ from bionty.core._bionty import filter_bionty_df_columns

  # create the corresponding bionty object from model
  try:
@@ -195,17 +205,20 @@ def create_records_from_public(
  organism = "human"
  elif iterable_idx[0].startswith("ENSMUSG"):
  organism = "mouse"
- public_ontology = model.public(
- organism=organism, public_source=kwargs.get("public_source")
- )
+ public_ontology = model.public(organism=organism, source=kwargs.get("source"))
  except Exception:
  # for custom records that are not created from public sources
  return records, iterable_idx
- # add public_source record to the kwargs
- kwargs.update({"public_source": get_public_source_record(public_ontology)})
+ # add source record to the kwargs
+ source_record = get_source_record(public_ontology)
+ if source_record is not None and source_record.in_db:
+ # skips the creation of records from public if the source is already in the db
+ return records, iterable_idx
+
+ kwargs.update({"source": source_record})

  # filter the columns in bionty df based on fields
- bionty_df = _filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+ bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

  # standardize in the bionty reference
  result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
@@ -301,43 +314,6 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
  return print_values


- def _filter_bionty_df_columns(model: Record, public_ontology: Any) -> pd.DataFrame:
- bionty_df = pd.DataFrame()
- if public_ontology is not None:
- model_field_names = {i.name for i in model._meta.fields}
- # parents needs to be added here as relationships aren't in fields
- model_field_names.add("parents")
- bionty_df = public_ontology.df().reset_index()
- if model.__name__ == "Gene":
- # groupby ensembl_gene_id and concat ncbi_gene_ids
- groupby_id_col = (
- "ensembl_gene_id" if "ensembl_gene_id" in bionty_df else "stable_id"
- )
- bionty_df.drop(
- columns=["hgnc_id", "mgi_id", "index"], errors="ignore", inplace=True
- )
- bionty_df.drop_duplicates([groupby_id_col, "ncbi_gene_id"], inplace=True)
- bionty_df["ncbi_gene_id"] = bionty_df["ncbi_gene_id"].fillna("")
- bionty_df = (
- bionty_df.groupby(groupby_id_col)
- .agg(
- {
- "symbol": "first",
- "ncbi_gene_id": "|".join,
- "biotype": "first",
- "description": "first",
- "synonyms": "first",
- }
- )
- .reset_index()
- )
- bionty_df.rename(columns={"ncbi_gene_id": "ncbi_gene_ids"}, inplace=True)
- # rename definition to description for the lnschema_bionty
- bionty_df.rename(columns={"definition": "description"}, inplace=True)
- bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
- return bionty_df
-
-
  def _bulk_create_dicts_from_df(
  keys: set | list, column_name: str, df: pd.DataFrame
  ) -> tuple[dict, str]:
@@ -359,9 +335,9 @@ def _bulk_create_dicts_from_df(
  return df.reset_index().to_dict(orient="records"), multi_msg


- def _has_organism_field(orm: Record) -> bool:
+ def _has_organism_field(registry: type[Record]) -> bool:
  try:
- orm._meta.get_field("organism")
+ registry._meta.get_field("organism")
  return True
  except FieldDoesNotExist:
  return False
@@ -370,12 +346,17 @@ def _has_organism_field(orm: Record) -> bool:
  def _get_organism_record(
  field: StrField, organism: str | Record, force: bool = False
  ) -> Record:
- model = field.field.model
- check = True if force else field.field.name != "ensembl_gene_id"
-
- if _has_organism_field(model) and check:
- from lnschema_bionty._bionty import create_or_get_organism_record
-
- organism_record = create_or_get_organism_record(organism=organism, orm=model)
+ registry = field.field.model
+ check = True
+ if not force and hasattr(registry, "_ontology_id_field"):
+ check = field.field.name != registry._ontology_id_field
+ # e.g. bionty.CellMarker has "name" as _ontology_id_field
+ if not registry._ontology_id_field.endswith("id"):
+ check = True
+
+ if _has_organism_field(registry) and check:
+ from bionty._bionty import create_or_get_organism_record
+
+ organism_record = create_or_get_organism_record(organism=organism, orm=registry)
  if organism_record is not None:
  return organism_record
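
The net effect of the `public_source` → `source` renames plus the new branching is that public-source lookup is skipped whenever the requested `source` (or the source of already-created records) is loaded into the instance (`in_db`). A small restatement of that decision logic, written as a hypothetical helper rather than the library API:

def resolve_from_source(from_source: bool, source, records) -> bool:
    """Restate the branching added to get_or_create_records above."""
    if source is not None:
        # an in-db source means records should come from the instance, not from public
        return not source.in_db
    if records and getattr(records[0], "source_id", None) and records[0].source.in_db:
        # existing records already point to an in-db source
        return False
    return from_source  # otherwise keep the caller's flag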
lamindb/_is_versioned.py CHANGED
@@ -16,7 +16,7 @@ def _add_to_version_family(
  ):
  old_uid = self.uid
  new_uid, version = get_uid_from_old_version(is_new_version_of, version)
- if self.__class__.__name__ == "Artifact" and self.key_is_virtual:
+ if self.__class__.__name__ == "Artifact" and self._key_is_virtual:
  old_path = self.path
  new_path = get_new_path_from_uid(
  old_path=old_path, old_uid=old_uid, new_uid=new_uid
lamindb/_parents.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations

  import builtins
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Literal

  import lamindb_setup as ln_setup
  from lamin_utils import logger
@@ -10,7 +10,7 @@ from lnschema_core.models import HasParents, format_field_value

  from lamindb._utils import attach_func_to_class_method

- from ._record import get_default_str_field
+ from ._record import get_name_field

  if TYPE_CHECKING:
  from lnschema_core.types import StrField
@@ -61,7 +61,7 @@ def view_parents(
  distance: int = 5,
  ):
  if field is None:
- field = get_default_str_field(self)
+ field = get_name_field(self)
  if not isinstance(field, str):
  field = field.field.name

@@ -137,10 +137,14 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non


  def _view_parents(
- record: Record, field: str, with_children: bool = False, distance: int = 100
+ record: Record,
+ field: str,
+ with_children: bool = False,
+ distance: int = 100,
+ attr_name: Literal["parents", "predecessors"] = "parents",
  ):
  """Graph of parents."""
- if not hasattr(record, "parents"):
+ if not hasattr(record, attr_name):
  raise NotImplementedError(
  f"Parents view is not supported for {record.__class__.__name__}!"
  )
@@ -149,13 +153,17 @@ def _view_parents(

  df_edges = None
  df_edges_parents = _df_edges_from_parents(
- record=record, field=field, distance=distance
+ record=record, field=field, distance=distance, attr_name=attr_name
  )
  if df_edges_parents is not None:
  df_edges = df_edges_parents
  if with_children:
  df_edges_children = _df_edges_from_parents(
- record=record, field=field, distance=distance, children=True
+ record=record,
+ field=field,
+ distance=distance,
+ children=True,
+ attr_name=attr_name,
  )
  if df_edges_children is not None:
  if df_edges is not None:
@@ -197,12 +205,18 @@ def _view_parents(
  _view(u)


- def _get_parents(record: Record, field: str, distance: int, children: bool = False):
+ def _get_parents(
+ record: Record,
+ field: str,
+ distance: int,
+ children: bool = False,
+ attr_name: Literal["parents", "predecessors"] = "parents",
+ ):
  """Recursively get parent records within a distance."""
  if children:
- key = "parents"
+ key = attr_name
  else:
- key = "children"
+ key = "children" if attr_name == "parents" else "successors" # type: ignore
  model = record.__class__
  condition = f"{key}__{field}"
  results = model.filter(**{condition: record.__getattribute__(field)}).all()
@@ -228,12 +242,23 @@ def _get_parents(record: Record, field: str, distance: int, children: bool = Fal


  def _df_edges_from_parents(
- record: Record, field: str, distance: int, children: bool = False
+ record: Record,
+ field: str,
+ distance: int,
+ children: bool = False,
+ attr_name: Literal["parents", "predecessors"] = "parents",
  ):
  """Construct a DataFrame of edges as the input of graphviz.Digraph."""
- key = "children" if children else "parents"
+ if attr_name == "parents":
+ key = "children" if children else "parents"
+ else:
+ key = "successors" if children else "predecessors"
  parents = _get_parents(
- record=record, field=field, distance=distance, children=children
+ record=record,
+ field=field,
+ distance=distance,
+ children=children,
+ attr_name=attr_name,
  )
  all = record.__class__.objects
  records = parents | all.filter(id=record.id)
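
The new `attr_name` parameter lets the same graph helpers walk either `parents`/`children` (ontology-style registries) or `predecessors`/`successors` (presumably for `Transform` lineage). A hedged sketch of calling the private helper directly with the new parameter; the record lookup is hypothetical:

import lamindb as ln
from lamindb._parents import _view_parents  # private helper changed above

transform = ln.Transform.filter(name="my analysis").first()  # hypothetical lookup
if transform is not None:
    _view_parents(transform, field="name", with_children=True, attr_name="predecessors")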