PyPI - lamindb - Versions diffs - 0.69.9__py3-none-any.whl → 0.70.0__py3-none-any.whl - Mend

lamindb 0.69.9__py3-none-any.whl → 0.70.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

lamindb/__init__.py +1 -1
lamindb/_annotate.py +461 -126
lamindb/_artifact.py +69 -20
lamindb/_can_validate.py +13 -18
lamindb/_collection.py +48 -44
lamindb/_feature_set.py +20 -8
lamindb/_finish.py +28 -42
lamindb/_from_values.py +23 -17
lamindb/_registry.py +7 -2
lamindb/core/__init__.py +16 -4
lamindb/core/_data.py +22 -16
lamindb/core/_feature_manager.py +80 -25
lamindb/core/_label_manager.py +1 -1
lamindb/core/_mapped_collection.py +106 -52
lamindb/core/_run_context.py +0 -1
lamindb/core/_settings.py +1 -1
lamindb/core/datasets/_core.py +42 -2
lamindb/core/storage/_backed_access.py +8 -4
lamindb/core/storage/file.py +9 -0
lamindb/core/storage/object.py +19 -0
lamindb/integrations/_vitessce.py +18 -9
{lamindb-0.69.9.dist-info → lamindb-0.70.0.dist-info}/METADATA +7 -8
{lamindb-0.69.9.dist-info → lamindb-0.70.0.dist-info}/RECORD +25 -25
{lamindb-0.69.9.dist-info → lamindb-0.70.0.dist-info}/LICENSE +0 -0
{lamindb-0.69.9.dist-info → lamindb-0.70.0.dist-info}/WHEEL +0 -0

lamindb/_finish.py CHANGED Viewed

@@ -43,39 +43,35 @@ def finish(i_saved_the_notebook: bool = False):
                 "Please pass `i_saved_the_notebook=True` to `ln.finish()`, save the notebook, and re-run this cell."
             )
             return None
-        notebook_content = read_notebook(run_context.path)  # type: ignore
-        if not check_last_cell(notebook_content, "i_saved_the_notebook"):
-            raise CallFinishInLastCell(
-                "Can only run `ln.finish(i_saved_the_notebook=True)` from the last code cell of the notebook."
-            )
         save_run_context_core(
             run=run_context.run,
             transform=run_context.transform,
             filepath=run_context.path,
             finished_at=True,
-            notebook_content=notebook_content,
         )
     else:
         # scripts
+        # save_run_context_core was already called during ln.track()
         run_context.run.finished_at = datetime.now(timezone.utc)  # update run time
         run_context.run.save()
-# do not type because we need to be aware of lnschema_core import order
 def save_run_context_core(
     *,
     run: Run,
     transform: Transform,
     filepath: Path,
     transform_family: QuerySet | None = None,
-    is_consecutive: bool = True,
     finished_at: bool = False,
-    notebook_content=None,  # nbproject.Notebook
 ) -> str | None:
     import lamindb as ln
     ln.settings.verbosity = "success"
+    # for scripts, things are easy
+    is_consecutive = True
+    source_code_path = filepath
+    # for notebooks, we need more work
     if transform.type == TransformType.notebook:
         try:
             import nbstripout
@@ -88,62 +84,52 @@ def save_run_context_core(
                 "install nbproject & nbstripout: pip install nbproject nbstripout"
             )
             return None
-        if notebook_content is None:
-            notebook_content = read_notebook(filepath)  # type: ignore
+        notebook_content = read_notebook(filepath)  # type: ignore
         is_consecutive = check_consecutiveness(notebook_content)
         if not is_consecutive:
+            msg = "   Do you still want to proceed with finishing? (y/n) "
             if os.getenv("LAMIN_TESTING") is None:
-                decide = input(
-                    "   Do you still want to proceed with publishing? (y/n) "
-                )
+                response = input(msg)
             else:
-                decide = "n"
-            if decide != "y":
-                logger.error("Aborted (non-consecutive)!")
+                response = "n"
+            if response != "y":
                 return "aborted-non-consecutive"
         # convert the notebook file to html
         # log_level is set to 40 to silence the nbconvert logging
-        result = subprocess.run(
+        subprocess.run(
             "jupyter nbconvert --to html"
             f" {filepath.as_posix()} --Application.log_level=40",
             shell=True,
+            check=True,
         )
         # move the temporary file into the cache dir in case it's accidentally
         # in an existing storage location -> we want to move associated
         # artifacts into default storage and not register them in an existing
         # location
-        filepath_html = filepath.with_suffix(".html")  # current location
+        filepath_html_orig = filepath.with_suffix(".html")  # current location
+        filepath_html = ln_setup.settings.storage.cache_dir / filepath_html_orig.name
+        # don't use Path.rename here because of cross-device link error
+        # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
         shutil.move(
-            filepath_html,  # type: ignore
-            ln_setup.settings.storage.cache_dir / filepath_html.name,
-        )  # move; don't use Path.rename here because of cross-device link error
-        # see https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
-        filepath_html = (
-            ln_setup.settings.storage.cache_dir / filepath_html.name
-        )  # adjust location
-        assert result.returncode == 0
-        # copy the notebook file to a temporary file
+            filepath_html_orig,  # type: ignore
+            filepath_html,
+        )
+        # strip the output from the notebook to create the source code file
+        # first, copy the notebook file to a temporary file in the cache
         source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
         shutil.copy2(filepath, source_code_path)  # copy
-        result = subprocess.run(f"nbstripout {source_code_path}", shell=True)
-        assert result.returncode == 0
-    else:
-        source_code_path = filepath
+        subprocess.run(f"nbstripout {source_code_path}", shell=True, check=True)
     # find initial versions of source codes and html reports
-    initial_report = None
-    initial_source = None
+    prev_report = None
+    prev_source = None
     if transform_family is None:
         transform_family = transform.versions
     if len(transform_family) > 0:
         for prev_transform in transform_family.order_by("-created_at"):
-            # check for id to avoid query
             if prev_transform.latest_report_id is not None:
-                # any previous latest report of this transform is OK!
-                initial_report = prev_transform.latest_report
+                prev_report = prev_transform.latest_report
             if prev_transform.source_code_id is not None:
-                # any previous source code id is OK!
-                initial_source = prev_transform.source_code
+                prev_source = prev_transform.source_code
     ln.settings.silence_file_run_transform_warning = True
     # register the source code
     if transform.source_code is not None:
@@ -173,7 +159,7 @@ def save_run_context_core(
             source_code_path,
             description=f"Source of transform {transform.uid}",
             version=transform.version,
-            is_new_version_of=initial_source,
+            is_new_version_of=prev_source,
             visibility=0,  # hidden file
             run=False,
         )
@@ -207,7 +193,7 @@ def save_run_context_core(
             report_file = ln.Artifact(
                 filepath_html,
                 description=f"Report of run {run.uid}",
-                is_new_version_of=initial_report,
+                is_new_version_of=prev_report,
                 visibility=0,  # hidden file
                 run=False,
             )

lamindb/_from_values.py CHANGED Viewed

@@ -19,19 +19,26 @@ def get_or_create_records(
     field: StrField,
     *,
     from_public: bool = False,
-    **kwargs,
+    organism: Registry | str | None = None,
+    public_source: Registry | None = None,
 ) -> list[Registry]:
     """Get or create records from iterables."""
     upon_create_search_names = settings.upon_create_search_names
-    settings.upon_create_search_names = False
     feature: Feature = None
+    organism = _get_organism_record(field, organism)
+    kwargs: dict = {}
+    if organism is not None:
+        kwargs["organism"] = organism
+    if public_source is not None:
+        kwargs["public_source"] = public_source
+    settings.upon_create_search_names = False
     try:
         Registry = field.field.model
         iterable_idx = index_iterable(iterable)
         # returns existing records & non-existing values
         records, nonexist_values, msg = get_existing_records(
-            iterable_idx=iterable_idx, field=field, kwargs=kwargs
+            iterable_idx=iterable_idx, field=field, **kwargs
         )
         # new records to be created based on new values
@@ -78,26 +85,14 @@ def get_or_create_records(
 def get_existing_records(
     iterable_idx: pd.Index,
     field: StrField,
-    kwargs: dict = None,
+    **kwargs,
 ):
-    if kwargs is None:
-        kwargs = {}
     model = field.field.model
     condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
     # existing records matching is agnostic to the bionty source
     if "public_source" in condition:
         condition.pop("public_source")
-    if _has_organism_field(model):
-        from lnschema_bionty._bionty import create_or_get_organism_record
-        organism_record = create_or_get_organism_record(
-            organism=kwargs.get("organism"), orm=model
-        )
-        if organism_record is not None:
-            kwargs.update({"organism": organism_record})
-            condition.update({"organism": organism_record})
     # standardize based on the DB reference
     # log synonyms mapped terms
     result = model.inspect(
@@ -252,7 +247,8 @@ def index_iterable(iterable: Iterable) -> pd.Index:
 def _print_values(names: list, n: int = 20) -> str:
-    print_values = ", ".join([f"'{name}'" for name in names[:n]])
+    names = list(set(names))
+    print_values = ", ".join([f"'{name}'" for name in names[:n] if name != "None"])
     if len(names) > n:
         print_values += ", ..."
     return print_values
@@ -322,3 +318,13 @@ def _has_organism_field(orm: Registry) -> bool:
         return True
     except FieldDoesNotExist:
         return False
+def _get_organism_record(field: StrField, organism: str | Registry) -> Registry:
+    model = field.field.model
+    if _has_organism_field(model):
+        from lnschema_bionty._bionty import create_or_get_organism_record
+        organism_record = create_or_get_organism_record(organism=organism, orm=model)
+        if organism_record is not None:
+            return organism_record

lamindb/_registry.py CHANGED Viewed

@@ -129,7 +129,11 @@ def __init__(orm: Registry, *args, **kwargs):
 @classmethod  # type:ignore
 @doc_args(Registry.from_values.__doc__)
 def from_values(
-    cls, values: ListLike, field: StrField | None = None, **kwargs
+    cls,
+    values: ListLike,
+    field: StrField | None = None,
+    organism: Registry | str | None = None,
+    public_source: Registry | None = None,
 ) -> list[Registry]:
     """{}."""
     from_public = True if cls.__module__.startswith("lnschema_bionty.") else False
@@ -138,7 +142,8 @@ def from_values(
         iterable=values,
         field=getattr(cls, field_str),
         from_public=from_public,
-        **kwargs,
+        organism=organism,
+        public_source=public_source,
     )

lamindb/core/__init__.py CHANGED Viewed

@@ -14,14 +14,21 @@ Registries:
    LabelManager
    IsTree
    IsVersioned
-   DataFrameAnnotator
-   AnnDataAnnotator
-   AnnotateLookup
    CanValidate
    HasParents
    InspectResult
    fields
+Annotators:
+.. autosummary::
+   :toctree: .
+   DataFrameAnnotator
+   AnnDataAnnotator
+   MuDataAnnotator
+   AnnotateLookup
 Classes:
 .. autosummary::
@@ -53,7 +60,12 @@ from lnschema_core.models import (
     Registry,
 )
-from lamindb._annotate import AnnDataAnnotator, AnnotateLookup, DataFrameAnnotator
+from lamindb._annotate import (
+    AnnDataAnnotator,
+    AnnotateLookup,
+    DataFrameAnnotator,
+    MuDataAnnotator,
+)
 from lamindb._query_manager import QueryManager
 from lamindb._query_set import QuerySet, RecordsList
 from lamindb.core._feature_manager import FeatureManager

lamindb/core/_data.py CHANGED Viewed

@@ -94,6 +94,23 @@ def save_feature_set_links(self: Artifact | Collection) -> None:
         bulk_create(links, ignore_conflicts=True)
+def format_repr(value: Registry, exclude: list[str] | str | None = None) -> str:
+    if isinstance(exclude, str):
+        exclude = [exclude]
+    exclude_fields = set() if exclude is None else set(exclude)
+    exclude_fields.update(["created_at", "updated_at"])
+    fields = [
+        f
+        for f in value.__repr__(include_foreign_keys=False).split(", ")
+        if not any(f"{excluded_field}=" in f for excluded_field in exclude_fields)
+    ]
+    repr = ", ".join(fields)
+    if not repr.endswith(")"):
+        repr += ")"
+    return repr
 @doc_args(Data.describe.__doc__)
 def describe(self: Data):
     """{}."""
@@ -109,17 +126,7 @@ def describe(self: Data):
         else:
             direct_fields.append(f.name)
-    # Display Provenance
-    # display line by line the foreign key fields
-    from lamindb._parents import _transform_emoji
-    emojis = {
-        "storage": "🗃️",
-        "created_by": "👤",
-        "transform": _transform_emoji(self.transform),
-        "run": "👣",
-        "artifact": "📄",
-    }
+    # provenance
     if len(foreign_key_fields) > 0:  # always True for Artifact and Collection
         record_msg = f"{colors.green(model_name)}{__repr__(self, include_foreign_keys=False).lstrip(model_name)}"
         msg += f"{record_msg}\n\n"
@@ -127,17 +134,16 @@ def describe(self: Data):
         msg += f"{colors.green('Provenance')}:\n  "
         related_msg = "".join(
             [
-                f"{emojis.get(i, '📎')} {i}: {self.__getattribute__(i)}\n  "
-                for i in foreign_key_fields
-                if self.__getattribute__(i) is not None
+                f"📎 {field}: {format_repr(self.__getattribute__(field))}\n  "
+                for field in foreign_key_fields
+                if self.__getattribute__(field) is not None
             ]
         )
         msg += related_msg
     # input of
-    # can only access many-to-many once record is saved
     if self.id is not None and self.input_of.exists():
         values = [format_field_value(i.started_at) for i in self.input_of.all()]
-        msg += f"⬇️ input_of ({colors.italic('core.Run')}): {values}\n    "
+        msg += f"📎 input_of ({colors.italic('core.Run')}): {values}\n    "
     msg = msg.rstrip(" ")  # do not use removesuffix as we need to remove 2 or 4 spaces
     msg += print_features(self)
     msg += print_labels(self)

lamindb/core/_feature_manager.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from itertools import compress
-from typing import TYPE_CHECKING, Iterable
+from typing import TYPE_CHECKING, Iterable, Optional
 import anndata as ad
 from anndata import AnnData
@@ -91,6 +91,8 @@ def get_feature_set_links(host: Artifact | Collection) -> QuerySet:
 def print_features(self: Data) -> str:
     from lamindb._from_values import _print_values
+    from ._data import format_repr
     msg = ""
     features_lookup = Feature.objects.using(self._state.db).lookup().dict()
     for slot, feature_set in self.features._feature_set_by_slot.items():
@@ -98,12 +100,16 @@ def print_features(self: Data) -> str:
             features = feature_set.members
             name_field = get_default_str_field(features[0])
             feature_names = [getattr(feature, name_field) for feature in features]
-            msg += f"  {colors.bold(slot)}: {feature_set}\n"
+            msg += (
+                f"  {colors.bold(slot)}: {format_repr(feature_set, exclude='hash')}\n"
+            )
             print_values = _print_values(feature_names, n=20)
             msg += f"    {print_values}\n"
         else:
             df_slot = feature_set.features.df()
-            msg += f"  {colors.bold(slot)}: {feature_set}\n"
+            msg += (
+                f"  {colors.bold(slot)}: {format_repr(feature_set, exclude='hash')}\n"
+            )
             for _, row in df_slot.iterrows():
                 if row["type"] == "category" and row["registries"] is not None:
                     labels = self.labels.get(
@@ -133,9 +139,10 @@ def print_features(self: Data) -> str:
 def parse_feature_sets_from_anndata(
     adata: AnnData,
-    var_field: FieldAttr,
+    var_field: FieldAttr | None = None,
     obs_field: FieldAttr = Feature.name,
-    **kwargs,
+    mute: bool = False,
+    organism: str | Registry | None = None,
 ) -> dict:
     data_parse = adata
     if not isinstance(adata, AnnData):  # is a path
@@ -149,29 +156,36 @@ def parse_feature_sets_from_anndata(
             data_parse = ad.read(filepath, backed="r")
         type = "float"
     else:
-        type = convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
+        type = (
+            "float"
+            if adata.X is None
+            else convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
+        )
     feature_sets = {}
-    logger.info("parsing feature names of X stored in slot 'var'")
-    logger.indent = "   "
-    feature_set_var = FeatureSet.from_values(
-        data_parse.var.index,
-        var_field,
-        type=type,
-        **kwargs,
-    )
-    if feature_set_var is not None:
-        feature_sets["var"] = feature_set_var
-        logger.save(f"linked: {feature_set_var}")
-    logger.indent = ""
-    if feature_set_var is None:
-        logger.warning("skip linking features to artifact in slot 'var'")
+    if var_field is not None:
+        logger.info("parsing feature names of X stored in slot 'var'")
+        logger.indent = "   "
+        feature_set_var = FeatureSet.from_values(
+            data_parse.var.index,
+            var_field,
+            type=type,
+            mute=mute,
+            organism=organism,
+        )
+        if feature_set_var is not None:
+            feature_sets["var"] = feature_set_var
+            logger.save(f"linked: {feature_set_var}")
+        logger.indent = ""
+        if feature_set_var is None:
+            logger.warning("skip linking features to artifact in slot 'var'")
     if len(data_parse.obs.columns) > 0:
         logger.info("parsing feature names of slot 'obs'")
         logger.indent = "   "
         feature_set_obs = FeatureSet.from_df(
             df=data_parse.obs,
             field=obs_field,
-            **kwargs,
+            mute=mute,
+            organism=organism,
         )
         if feature_set_obs is not None:
             feature_sets["obs"] = feature_set_obs
@@ -224,7 +238,7 @@ class FeatureManager:
             slot = "columns" if slot is None else slot
         self._add_feature_set(feature_set=FeatureSet(features=features), slot=slot)
-    def add_from_df(self, field: FieldAttr = Feature.name, **kwargs):
+    def add_from_df(self, field: FieldAttr = Feature.name, organism: str | None = None):
         """Add features from DataFrame."""
         if isinstance(self._host, Artifact):
             assert self._host.accessor == "DataFrame"
@@ -235,7 +249,7 @@ class FeatureManager:
         # parse and register features
         registry = field.field.model
         df = self._host.load()
-        features = registry.from_values(df.columns, field=field, **kwargs)
+        features = registry.from_values(df.columns, field=field, organism=organism)
         if len(features) == 0:
             logger.error(
                 "no validated features found in DataFrame! please register features first!"
@@ -252,7 +266,8 @@ class FeatureManager:
         self,
         var_field: FieldAttr,
         obs_field: FieldAttr | None = Feature.name,
-        **kwargs,
+        mute: bool = False,
+        organism: str | Registry | None = None,
     ):
         """Add features from AnnData."""
         if isinstance(self._host, Artifact):
@@ -263,13 +278,53 @@ class FeatureManager:
         # parse and register features
         adata = self._host.load()
         feature_sets = parse_feature_sets_from_anndata(
-            adata, var_field=var_field, obs_field=obs_field, **kwargs
+            adata,
+            var_field=var_field,
+            obs_field=obs_field,
+            mute=mute,
+            organism=organism,
         )
         # link feature sets
         self._host._feature_sets = feature_sets
         self._host.save()
+    def add_from_mudata(
+        self,
+        var_fields: dict[str, FieldAttr],
+        obs_fields: dict[str, FieldAttr] = None,
+        mute: bool = False,
+        organism: str | Registry | None = None,
+    ):
+        """Add features from MuData."""
+        if obs_fields is None:
+            obs_fields = {}
+        if isinstance(self._host, Artifact):
+            assert self._host.accessor == "MuData"
+        else:
+            raise NotImplementedError()
+        # parse and register features
+        mdata = self._host.load()
+        feature_sets = {}
+        obs_features = features = Feature.from_values(mdata.obs.columns)
+        if len(obs_features) > 0:
+            feature_sets["obs"] = FeatureSet(features=features)
+        for modality, field in var_fields.items():
+            modality_fs = parse_feature_sets_from_anndata(
+                mdata[modality],
+                var_field=field,
+                obs_field=obs_fields.get(modality, Feature.name),
+                mute=mute,
+                organism=organism,
+            )
+            for k, v in modality_fs.items():
+                feature_sets[f"['{modality}'].{k}"] = v
+        # link feature sets
+        self._host._feature_sets = feature_sets
+        self._host.save()
     def _add_feature_set(self, feature_set: FeatureSet, slot: str):
         """Add new feature set to a slot.

lamindb/core/_label_manager.py CHANGED Viewed

@@ -49,7 +49,7 @@ def print_labels(self: Data):
             n = labels.count()
             field = get_default_str_field(labels)
             print_values = _print_values(labels.list(field), n=10)
-            labels_msg += f"  🏷️ {related_name} ({n}, {colors.italic(related_model)}): {print_values}\n"
+            labels_msg += f"  📎 {related_name} ({n}, {colors.italic(related_model)}): {print_values}\n"
     if len(labels_msg) > 0:
         return f"{colors.green('Labels')}:\n{labels_msg}"
     else:

lamindb 0.69.9__py3-none-any.whl → 0.70.0__py3-none-any.whl

lamindb 0.69.9__py3-none-any.whl → 0.70.0__py3-none-any.whl