lamindb 0.63.5__py3-none-any.whl → 0.64.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
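
The dominant change visible below is the rename of the `File` registry to `Artifact` across imports, type hints, and accessors (e.g. `Dataset.file`/`Dataset.files` become `Dataset.artifact`/`Dataset.artifacts`). As a rough, hedged sketch of what this looks like from user code (based only on the import and attribute changes in this diff, not on the full public API):

import lamindb as ln

# 0.63.x code worked with File records; 0.64.x works with Artifact records
artifact = ln.filter(ln.Artifact, uid="...").one()  # "..." is a placeholder uid
dataset = ln.Dataset(artifact)  # Dataset now links an Artifact instead of a File
dataset.save()                  # also saves dataset.artifact
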
lamindb/_dataset.py CHANGED
@@ -1,15 +1,10 @@
  from collections import defaultdict
- from pathlib import Path
  from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union

  import anndata as ad
  import pandas as pd
  from lamin_utils import logger
- from lamindb_setup._init_instance import register_storage
- from lamindb_setup.dev import StorageSettings
  from lamindb_setup.dev._docs import doc_args
- from lamindb_setup.dev._hub_utils import get_storage_region
- from lamindb_setup.dev.upath import UPath
  from lnschema_core.models import Dataset, Feature, FeatureSet
  from lnschema_core.types import AnnDataLike, DataLike, FieldAttr, VisibilityChoice

@@ -19,8 +14,8 @@ from lamindb.dev._mapped_dataset import MappedDataset
  from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
  from lamindb.dev.versioning import get_ids_from_old_version, init_uid

- from . import _TESTING, File, Run
- from ._file import parse_feature_sets_from_anndata
+ from . import _TESTING, Artifact, Run
+ from ._artifact import parse_feature_sets_from_anndata
  from ._registry import init_self_from_db
  from .dev._data import (
  add_transform_to_kwargs,
@@ -42,7 +37,7 @@ def __init__(
  # now we proceed with the user-facing constructor
  if len(args) > 1:
  raise ValueError("Only one non-keyword arg allowed: data")
- data: Union[pd.DataFrame, ad.AnnData, File, Iterable[File]] = (
+ data: Union[pd.DataFrame, ad.AnnData, Artifact, Iterable[Artifact]] = (
  kwargs.pop("data") if len(args) == 0 else args[0]
  )
  meta: Optional[str] = kwargs.pop("meta") if "meta" in kwargs else None
@@ -96,70 +91,56 @@ def __init__(

  run = get_run(run)
  data_init_complete = False
- file = None
- files = None
- storage = None
- # init from directory or bucket
- if isinstance(data, (str, Path, UPath)):
- upath = UPath(data)
- # below frequently times out on GCP
- # comment this and corresponding test out
- # if not upath.is_dir():
- # raise ValueError(f"Can only pass buckets or directories, not {data}")
- upath_str = upath.as_posix().rstrip("/")
- region = get_storage_region(upath_str)
- storage_settings = StorageSettings(upath_str, region)
- storage = register_storage(storage_settings)
- hash = None
- data_init_complete = True
+ artifact = None
+ artifacts = None
  # now handle potential metadata
  if meta is not None:
- if not isinstance(meta, (pd.DataFrame, ad.AnnData, File)):
+ if not isinstance(meta, (pd.DataFrame, ad.AnnData, Artifact)):
  raise ValueError(
- "meta has to be of type `(pd.DataFrame, ad.AnnData, File)`"
+ "meta has to be of type `(pd.DataFrame, ad.AnnData, Artifact)`"
  )
  data = meta
- # init file - is either data or metadata
- if isinstance(data, (pd.DataFrame, ad.AnnData, File)):
- if isinstance(data, File):
- file = data
- if file._state.adding:
- raise ValueError("Save file before creating dataset!")
+ # init artifact - is either data or metadata
+ if isinstance(data, (pd.DataFrame, ad.AnnData, Artifact)):
+ if isinstance(data, Artifact):
+ artifact = data
+ if artifact._state.adding:
+ raise ValueError("Save artifact before creating dataset!")
  if not feature_sets:
- feature_sets = file.features._feature_set_by_slot
+ feature_sets = artifact.features._feature_set_by_slot
  else:
- if len(file.features._feature_set_by_slot) > 0:
- logger.info("overwriting feature sets linked to file")
+ if len(artifact.features._feature_set_by_slot) > 0:
+ logger.info("overwriting feature sets linked to artifact")
  else:
  log_hint = True if feature_sets is None else False
- file_is_new_version_of = (
- is_new_version_of.file if is_new_version_of is not None else None
+ artifact_is_new_version_of = (
+ is_new_version_of.artifact if is_new_version_of is not None else None
  )
- file = File(
+ artifact = Artifact(
  data,
  run=run,
  description="tmp",
  log_hint=log_hint,
  version=version,
- is_new_version_of=file_is_new_version_of,
+ is_new_version_of=artifact_is_new_version_of,
  )
- # do we really want to update the file here?
+ # do we really want to update the artifact here?
  if feature_sets:
- file._feature_sets = feature_sets
- hash = file.hash # type: ignore
- provisional_uid = file.uid # type: ignore
- if file.description is None or file.description == "tmp":
- file.description = f"See dataset {provisional_uid}" # type: ignore
+ artifact._feature_sets = feature_sets
+ hash = artifact.hash # type: ignore
+ provisional_uid = artifact.uid # type: ignore
+ if artifact.description is None or artifact.description == "tmp":
+ artifact.description = f"See dataset {provisional_uid}" # type: ignore
  data_init_complete = True
  if not data_init_complete:
  if hasattr(data, "__getitem__"):
- assert isinstance(data[0], File) # type: ignore
- files = data
- hash, feature_sets = from_files(files) # type: ignore
+ assert isinstance(data[0], Artifact) # type: ignore
+ artifacts = data
+ hash, feature_sets = from_artifacts(artifacts) # type: ignore
  data_init_complete = True
  else:
  raise ValueError(
- "Only DataFrame, AnnData, folder or list of File is allowed."
+ "Only DataFrame, AnnData, Artifact or list of artifacts is allowed."
  )
  # we ignore datasets in trash containing the same hash
  if hash is not None:
@@ -183,8 +164,7 @@ def __init__(
  description=description,
  reference=reference,
  reference_type=reference_type,
- file=file,
- storage=storage,
+ artifact=artifact,
  hash=hash,
  run=run,
  version=version,
@@ -192,15 +172,15 @@ def __init__(
  visibility=visibility,
  **kwargs,
  )
- dataset._files = files
+ dataset._artifacts = artifacts
  dataset._feature_sets = feature_sets
  # register provenance
  if is_new_version_of is not None:
  _track_run_input(is_new_version_of, run=run)
- if file is not None and file.run != run:
- _track_run_input(file, run=run)
- elif files is not None:
- _track_run_input(files, run=run)
+ if artifact is not None and artifact.run != run:
+ _track_run_input(artifact, run=run)
+ elif artifacts is not None:
+ _track_run_input(artifacts, run=run)


  @classmethod # type: ignore
@@ -215,7 +195,7 @@ def from_df(
  reference: Optional[str] = None,
  reference_type: Optional[str] = None,
  version: Optional[str] = None,
- is_new_version_of: Optional["File"] = None,
+ is_new_version_of: Optional["Artifact"] = None,
  **kwargs,
  ) -> "Dataset":
  """{}"""
@@ -250,11 +230,11 @@ def from_anndata(
  reference: Optional[str] = None,
  reference_type: Optional[str] = None,
  version: Optional[str] = None,
- is_new_version_of: Optional["File"] = None,
+ is_new_version_of: Optional["Artifact"] = None,
  **kwargs,
  ) -> "Dataset":
  """{}"""
- if isinstance(adata, File):
+ if isinstance(adata, Artifact):
  assert not adata._state.adding
  assert adata.accessor == "AnnData"
  adata_parse = adata.path
@@ -276,23 +256,24 @@ def from_anndata(


  # internal function, not exposed to user
- def from_files(files: Iterable[File]) -> Tuple[str, Dict[str, str]]:
- # assert all files are already saved
+ def from_artifacts(artifacts: Iterable[Artifact]) -> Tuple[str, Dict[str, str]]:
+ # assert all artifacts are already saved
  logger.debug("check not saved")
- saved = not any([file._state.adding for file in files])
+ saved = not any([artifact._state.adding for artifact in artifacts])
  if not saved:
- raise ValueError("Not all files are yet saved, please save them")
- # query all feature sets of files
- logger.debug("file ids")
- file_ids = [file.id for file in files]
- # query all feature sets at the same time rather than making a single query per file
- logger.debug("feature_set_file_links")
- feature_set_file_links = File.feature_sets.through.objects.filter(
- file_id__in=file_ids
+ raise ValueError("Not all artifacts are yet saved, please save them")
+ # query all feature sets of artifacts
+ logger.debug("artifact ids")
+ artifact_ids = [artifact.id for artifact in artifacts]
+ # query all feature sets at the same time rather
+ # than making a single query per artifact
+ logger.debug("feature_set_artifact_links")
+ feature_set_artifact_links = Artifact.feature_sets.through.objects.filter(
+ artifact_id__in=artifact_ids
  )
  feature_sets_by_slots = defaultdict(list)
  logger.debug("slots")
- for link in feature_set_file_links:
+ for link in feature_set_artifact_links:
  feature_sets_by_slots[link.slot].append(link.feature_set_id)
  feature_sets_union = {}
  logger.debug("union")
@@ -318,14 +299,14 @@ def from_files(files: Iterable[File]) -> Tuple[str, Dict[str, str]]:
  # validate consistency of hashes
  # we do not allow duplicate hashes
  logger.debug("hashes")
- # file.hash is None for zarr
+ # artifact.hash is None for zarr
  # todo: more careful handling of such cases
- hashes = [file.hash for file in files if file.hash is not None]
+ hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
  if len(hashes) != len(set(hashes)):
  seen = set()
  non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore
  raise ValueError(
- "Please pass files with distinct hashes: these ones are non-unique"
+ "Please pass artifacts with distinct hashes: these ones are non-unique"
  f" {non_unique}"
  )
  time = logger.debug("hash")
@@ -346,14 +327,14 @@ def mapped(
  ) -> "MappedDataset":
  _track_run_input(self, is_run_input)
  path_list = []
- for file in self.files.all():
- if file.suffix not in {".h5ad", ".zrad", ".zarr"}:
- logger.warning(f"Ignoring file with suffix {file.suffix}")
+ for artifact in self.artifacts.all():
+ if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
+ logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
  continue
- elif not stream and file.suffix == ".h5ad":
- path_list.append(file.stage())
+ elif not stream and artifact.suffix == ".h5ad":
+ path_list.append(artifact.stage())
  else:
- path_list.append(file.path)
+ path_list.append(artifact.path)
  return MappedDataset(path_list, label_keys, join_vars, encode_labels, parallel)


@@ -362,9 +343,9 @@ def backed(
  self, is_run_input: Optional[bool] = None
  ) -> Union["AnnDataAccessor", "BackedAccessor"]:
  _track_run_input(self, is_run_input)
- if self.file is None:
- raise RuntimeError("Can only call backed() for datasets with a single file")
- return self.file.backed()
+ if self.artifact is None:
+ raise RuntimeError("Can only call backed() for datasets with a single artifact")
+ return self.artifact.backed()


  # docstring handled through attach_func_to_class_method
@@ -375,25 +356,25 @@ def load(
  **kwargs,
  ) -> DataLike:
  # cannot call _track_run_input here, see comment further down
- if self.file is not None:
+ if self.artifact is not None:
  _track_run_input(self, is_run_input)
- return self.file.load()
+ return self.artifact.load()
  else:
- all_files = self.files.all()
- suffixes = [file.suffix for file in all_files]
+ all_artifacts = self.artifacts.all()
+ suffixes = [artifact.suffix for artifact in all_artifacts]
  if len(set(suffixes)) != 1:
  raise RuntimeError(
- "Can only load datasets where all files have the same suffix"
+ "Can only load datasets where all artifacts have the same suffix"
  )
  # because we're tracking data flow on the dataset-level, here, we don't
- # want to track it on the file-level
- objects = [file.load(is_run_input=False) for file in all_files]
- file_uids = [file.uid for file in all_files]
+ # want to track it on the artifact-level
+ objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
+ artifact_uids = [artifact.uid for artifact in all_artifacts]
  if isinstance(objects[0], pd.DataFrame):
  concat_object = pd.concat(objects, join=join)
  elif isinstance(objects[0], ad.AnnData):
  concat_object = ad.concat(
- objects, join=join, label="file_uid", keys=file_uids
+ objects, join=join, label="artifact_uid", keys=artifact_uids
  )
  # only call it here because there might be errors during concat
  _track_run_input(self, is_run_input)
@@ -409,10 +390,10 @@ def delete(
  self.visibility = VisibilityChoice.trash.value
  self.save()
  logger.warning("moved dataset to trash.")
- if self.file is not None:
- self.file.visibility = VisibilityChoice.trash.value
- self.file.save()
- logger.warning("moved dataset.file to trash.")
+ if self.artifact is not None:
+ self.artifact.visibility = VisibilityChoice.trash.value
+ self.artifact.save()
+ logger.warning("moved dataset.artifact to trash.")
  return

  # permanent delete
@@ -427,38 +408,30 @@ def delete(

  if delete_record:
  super(Dataset, self).delete()
- if self.file is not None:
- self.file.delete(permanent=permanent, storage=storage)
+ if self.artifact is not None:
+ self.artifact.delete(permanent=permanent, storage=storage)


  # docstring handled through attach_func_to_class_method
  def save(self, *args, **kwargs) -> None:
- if self.file is not None:
- self.file.save()
+ if self.artifact is not None:
+ self.artifact.save()
  # we don't need to save feature sets again
  save_feature_sets(self)
  super(Dataset, self).save()
- if hasattr(self, "_files"):
- if self._files is not None and len(self._files) > 0:
- self.files.set(self._files)
+ if hasattr(self, "_artifacts"):
+ if self._artifacts is not None and len(self._artifacts) > 0:
+ self.artifacts.set(self._artifacts)
  save_feature_set_links(self)


- @property # type: ignore
- @doc_args(Dataset.path.__doc__)
- def path(self) -> Union[Path, UPath]:
- """{}"""
- _track_run_input(self)
- return self.storage.path
-
-
  # docstring handled through attach_func_to_class_method
  def restore(self) -> None:
  self.visibility = VisibilityChoice.default.value
  self.save()
- if self.file is not None:
- self.file.visibility = VisibilityChoice.default.value
- self.file.save()
+ if self.artifact is not None:
+ self.artifact.visibility = VisibilityChoice.default.value
+ self.artifact.save()


  METHOD_NAMES = [
@@ -485,6 +458,5 @@ if _TESTING:
  for name in METHOD_NAMES:
  attach_func_to_class_method(name, Dataset, globals())

- setattr(Dataset, "path", path)
  # this seems a Django-generated function
  delattr(Dataset, "get_visibility_display")
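
Taken together, the `_dataset.py` changes mean a `Dataset` is now assembled from saved `Artifact` records (via `from_artifacts()` above) rather than `File` records, and bulk loading labels the concatenation with `artifact_uid`. A hedged usage sketch, assuming a top-level `ln.Artifact` constructor that accepts a DataFrame (mirroring the internal `Artifact(data, ...)` call above) and a `name` argument for `Dataset` that this diff does not show:

import pandas as pd
import lamindb as ln

# two small, made-up tables standing in for real measurements
df1 = pd.DataFrame({"value": [1, 2]})
df2 = pd.DataFrame({"value": [3, 4]})

af1 = ln.Artifact(df1, description="batch 1")
af1.save()  # from_artifacts() raises if any artifact is unsaved
af2 = ln.Artifact(df2, description="batch 2")
af2.save()

dataset = ln.Dataset([af1, af2], name="both batches")  # name= is assumed here
dataset.save()       # links the artifacts via dataset.artifacts.set(...)
df = dataset.load()  # same-suffix artifacts are concatenated with pd.concat
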
lamindb/_delete.py CHANGED
@@ -38,12 +38,12 @@ def delete( # type: ignore

  Delete files (delete the metadata record and the file in storage):

- >>> file = ln.filter(File, id=file_id).one()
+ >>> file = ln.filter(File, id=artifact_id).one()
  >>> ln.delete(file)
  >>> # deleting the record occurs automatically
  >>> # you will be asked whether to delete the file in storage
  >>> # for more control, use:
- >>> file.delete(storage=True)
+ >>> artifact.delete(storage=True)

  Bulk delete via QuerySet:

lamindb/_filter.py CHANGED
@@ -1,6 +1,6 @@
  from typing import Type

- from lnschema_core import Dataset, File, Registry
+ from lnschema_core import Artifact, Dataset, Registry
  from lnschema_core.types import VisibilityChoice

  from lamindb._query_set import QuerySet
@@ -8,7 +8,7 @@ from lamindb._query_set import QuerySet

  def filter(Registry: Type[Registry], **expressions) -> QuerySet:
  """See :meth:`~lamindb.dev.Registry.filter`."""
- if Registry in {File, Dataset}:
+ if Registry in {Artifact, Dataset}:
  # visibility is set to 0 unless expressions contains id or uid equality
  if not ("id" in expressions or "uid" in expressions):
  visibility = "visibility"
lamindb/_parents.py CHANGED
@@ -2,7 +2,7 @@ import builtins
  from typing import List, Optional, Set, Union

  from lamin_utils import logger
- from lnschema_core import Dataset, File, Registry, Run, Transform
+ from lnschema_core import Artifact, Dataset, Registry, Run, Transform
  from lnschema_core.models import HasParents, format_field_value

  from lamindb._utils import attach_func_to_class_method
@@ -61,7 +61,7 @@ def view_parents(
  )


- def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:
+ def view_flow(data: Union[Artifact, Dataset], with_children: bool = True) -> None:
  """Graph of data flow.

  Notes:
@@ -69,7 +69,7 @@ def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:

  Examples:
  >>> dataset.view_flow()
- >>> file.view_flow()
+ >>> artifact.view_flow()
  """
  import graphviz

@@ -81,7 +81,7 @@ def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:
  data_label = _record_label(data)

  def add_node(
- record: Union[Run, File, Dataset],
+ record: Union[Run, Artifact, Dataset],
  node_id: str,
  node_label: str,
  u: graphviz.Digraph,
@@ -257,7 +257,7 @@ def _df_edges_from_parents(


  def _record_label(record: Registry, field: Optional[str] = None):
- if isinstance(record, File):
+ if isinstance(record, Artifact):
  if record.description is None:
  name = record.key
  else:
@@ -305,7 +305,7 @@ def _add_emoji(record: Registry, label: str):
  return f"{emoji} {label}"


- def _get_all_parent_runs(data: Union[File, Dataset]) -> List:
+ def _get_all_parent_runs(data: Union[Artifact, Dataset]) -> List:
  """Get all input file/dataset runs recursively."""
  name = data._meta.model_name
  run_inputs_outputs = []
@@ -331,7 +331,7 @@ def _get_all_parent_runs(data: Union[File, Dataset]) -> List:
  return run_inputs_outputs


- def _get_all_child_runs(data: Union[File, Dataset]) -> List:
+ def _get_all_child_runs(data: Union[Artifact, Dataset]) -> List:
  """Get all output file/dataset runs recursively."""
  name = data._meta.model_name
  all_runs: Set[Run] = set()
lamindb/_query_manager.py CHANGED
@@ -30,7 +30,10 @@ class QueryManager(models.Manager):

  def _track_run_input_manager(self):
  if hasattr(self, "source_field_name") and hasattr(self, "target_field_name"):
- if self.source_field_name == "dataset" and self.target_field_name == "file":
+ if (
+ self.source_field_name == "dataset"
+ and self.target_field_name == "artifact"
+ ):
  from lamindb.dev._data import WARNING_RUN_TRANSFORM, _track_run_input
  from lamindb.dev._run_context import run_context

@@ -95,7 +98,7 @@ class QueryManager(models.Manager):
  target_field_name = self.target_field_name

  if (
- source_field_name in {"file", "dataset"}
+ source_field_name in {"artifact", "dataset"}
  and target_field_name == "feature_set"
  ):
  return get_feature_set_by_slot(host=self.instance).get(item)
lamindb/_registry.py CHANGED
@@ -184,8 +184,8 @@ def _search(
  case_sensitive=case_sensitive,
  )

- # search in both key and description fields for file
- if orm._meta.model.__name__ == "File" and field is None:
+ # search in both key and description fields for Artifact
+ if orm._meta.model.__name__ == "Artifact" and field is None:
  field = ["key", "description"]

  if not isinstance(field, List):
@@ -405,7 +405,7 @@ def transfer_fk_to_default_db_bulk(records: List):
  "bionty_source",
  "initial_version",
  "latest_report", # Transform
- "source_file", # Transform
+ "source_code", # Transform
  "report", # Run
  "file", # Dataset
  ]: