PyPI - lamindb - Versions diffs - 0.32.0rc1__py2.py3-none-any.whl → 0.33.0__py2.py3-none-any.whl - Mend

lamindb 0.32.0rc1__py2.py3-none-any.whl → 0.33.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

lamindb/__init__.py +21 -14
lamindb/_check_versions.py +6 -6
lamindb/_context.py +196 -0
lamindb/_delete.py +13 -7
lamindb/_load.py +23 -33
lamindb/_nb.py +19 -84
lamindb/_record.py +55 -21
lamindb/dev/db/_add.py +22 -7
lamindb/schema/__init__.py +1 -15
lamindb-0.33.0.dist-info/METADATA +236 -0
{lamindb-0.32.0rc1.dist-info → lamindb-0.33.0.dist-info}/RECORD +14 -16
{lamindb-0.32.0rc1.dist-info → lamindb-0.33.0.dist-info}/WHEEL +1 -1
lamindb/knowledge/__init__.py +0 -34
lamindb/knowledge/_core.py +0 -71
lamindb/knowledge/_lookup.py +0 -18
lamindb-0.32.0rc1.dist-info/METADATA +0 -178
{lamindb-0.32.0rc1.dist-info → lamindb-0.33.0.dist-info}/LICENSE +0 -0
{lamindb-0.32.0rc1.dist-info → lamindb-0.33.0.dist-info}/entry_points.txt +0 -0

lamindb/__init__.py CHANGED Viewed

@@ -14,6 +14,20 @@ and in-memory data objects (`DataFrame`, `AnnData`, etc.).
    DObject
    DFolder
+Data objects are transformed by runs:
+.. autosummary::
+   :toctree: .
+   Run
+Tracking data by features:
+.. autosummary::
+   :toctree: .
+   Features
 Query & manipulate data:
 .. autosummary::
@@ -44,13 +58,6 @@ Schema - entities and their relations:
    schema
-Track Jupyter notebooks:
-.. autosummary::
-   :toctree: .
-   nb
 Setup:
 .. autosummary::
@@ -63,11 +70,12 @@ Developer API:
 .. autosummary::
    :toctree: .
+   context
    settings
    dev
 """
-__version__ = "0.32.0rc1"  # denote a release candidate for 0.1.0 with 0.1rc1
+__version__ = "0.33.0"  # denote a release candidate for 0.1.0 with 0.1rc1
 # prints warning of python versions
 from lamin_logger import logger as _logger
@@ -104,12 +112,11 @@ else:
             " instance."
         )
-from lnschema_core import DFolder  # noqa
-from lnschema_core import DObject  # noqa
+from lnschema_core import DFolder, DObject, Features, Run  # noqa
 dobject_doc = """Data objects in storage & memory.
-- Guide: :doc:`/guide/ingest`
+- Guide: :doc:`/guide/track`
 - FAQ: :doc:`/faq/ingest`
 A `DObject` is typically instantiated from data using the arguments below.
@@ -119,14 +126,13 @@ fields directly.
 Args:
    data: Filepath or in-memory data.
    name: Name of the data object, required if an in-memory object is passed.
-   features_ref: Reference against which to link features.
-   source: The source of the data object (a :class:`~lamindb.schema.Run`).
+   source: The source of the data object (a :class:`~lamindb.Run`).
    id: The id of the dobject.
    format: Whether to use `h5ad` or `zarr` to store an `AnnData` object.
 Data objects (`dobjects`) represent atomic datasets in object storage:
 jointly measured observations of variables (features).
-They are generated by running code, instances of :class:`~lamindb.schema.Run`.
+They are generated by running code, instances of :class:`~lamindb.Run`.
 A `dobject` may contain a single observation, for instance, a single image.
@@ -162,6 +168,7 @@ DObject.__doc__ = dobject_doc
 from . import dev  # noqa
 from . import schema  # noqa
 from . import setup  # noqa
+from ._context import context  # noqa
 from ._delete import delete  # noqa
 from ._nb import nb  # noqa
 from ._settings import settings

lamindb/_check_versions.py CHANGED Viewed

@@ -4,14 +4,14 @@ from lnschema_core import __version__ as lnschema_core_v
 from nbproject import __version__ as nbproject_v
 from packaging import version
-if version.parse(lndb_v) != version.parse("0.37.1"):
-    raise RuntimeError("Upgrade lndb! pip install lndb==0.37.1")
+if version.parse(lndb_v) < version.parse("0.37.4"):
+    raise RuntimeError("Upgrade lndb! pip install lndb>=0.37.4")
-if version.parse(lnschema_core_v) != version.parse("0.29.1"):
-    raise RuntimeError("lamindb needs lnschema_core==0.29.1")
+if version.parse(lnschema_core_v) != version.parse("0.29.5"):
+    raise RuntimeError("lamindb needs lnschema_core==0.29.5")
-if version.parse(nbproject_v) < version.parse("0.8.2"):
-    raise RuntimeError("lamindb needs nbproject>=0.8.2")
+if version.parse(nbproject_v) < version.parse("0.8.3"):
+    raise RuntimeError("lamindb needs nbproject>=0.8.3")
 # ensure that the lamin package is not installed
 try:

lamindb/_context.py ADDED Viewed

@@ -0,0 +1,196 @@
+from pathlib import Path
+from typing import List, Optional, Union
+import lnschema_core
+import nbproject
+from lamin_logger import logger
+from lndb import settings
+from lndb.dev import InstanceSettings
+from lnschema_core import Notebook, Pipeline, Run, dev
+from nbproject._is_run_from_ipython import is_run_from_ipython
+class context:
+    """Global run context.
+    Set through `ln.Run(global_context=True)`.
+    Often, you'll want to call: `ln.Run(global_context=True, load_latest=True)`.
+    """
+    instance: Optional[InstanceSettings] = None
+    """Current instance."""
+    notebook: Optional[Notebook] = None
+    """Current notebook."""
+    pipeline: Optional[Pipeline] = None
+    """Current pipeline."""
+    run: Optional[Run] = None
+    """Current run."""
+    @classmethod
+    def _track_notebook(
+        cls,
+        *,
+        id: Optional[str] = None,
+        v: Optional[str] = "0",
+        name: Optional[str] = None,
+        filepath: Optional[str] = None,
+        pypackage: Union[str, List[str], None] = None,
+        editor: Optional[str] = None,
+    ):
+        """Track notebook.
+        Args:
+            id: Pass a notebook id manually.
+            v: Pass a notebook version manually.
+            name: Pass a notebook name manually.
+            pypackage: One or more python packages to track.
+            filepath: Filepath of notebook. Only needed if automatic inference fails.
+            editor: Editor environment. Only needed if automatic inference fails.
+                Pass `'lab'` for jupyter lab and `'notebook'` for jupyter notebook,
+                this can help to identify the correct mechanism for interactivity
+                when automatic inference fails.
+        """
+        cls.instance = settings.instance
+        # original location of this code was _nb
+        # legacy code here, see duplicated version in _run
+        if id is None and name is None:
+            nbproject_failed_msg = (
+                "Auto-retrieval of notebook name & title failed.\nPlease paste error"
+                " at: https://github.com/laminlabs/nbproject/issues/new \n\nFix: Run"
+                f" ln.nb.header(id={dev.id.notebook()}, name='my-notebook-name')"
+            )
+            try:
+                nbproject.header(
+                    pypackage=pypackage, filepath=filepath, env=editor, display=False
+                )
+            except Exception:
+                raise RuntimeError(nbproject_failed_msg)
+            # this contains filepath if the header was run successfully
+            from nbproject._header import _filepath
+            id = nbproject.meta.store.id
+            v = nbproject.meta.store.version
+            name = Path(_filepath).stem
+            title = nbproject.meta.live.title
+        elif id is None or name is None:
+            # Both id and name need to be passed if passing it manually
+            raise RuntimeError("Fix: Pass both id & name to ln.nb.header().")
+        else:
+            title = None
+        import lamindb as ln
+        import lamindb.schema as lns
+        notebook = ln.select(
+            lns.Notebook,
+            id=id,
+            v=v,
+        ).one_or_none()
+        if notebook is None:
+            notebook = lns.Notebook(
+                id=id,
+                v=v,
+                name=name,
+                title=title,
+            )
+            notebook = ln.add(notebook)
+            logger.info(f"Added notebook: {notebook}")
+        else:
+            logger.info(f"Loaded notebook: {notebook}")
+            if notebook.name != name or notebook.title != title:
+                response = input(
+                    "Updated notebook name and/or title: Do you want to assign a new id"
+                    " or version? (y/n)"
+                )
+                if response == "y":
+                    print("Notebook metadata will be re-initialized.")
+                    new_id, new_v = None, None
+                    response = input("Do you want to generate a new id? (y/n)")
+                    if response == "y":
+                        new_id = lnschema_core.dev.id.notebook()
+                    response = input(
+                        "Do you want to set a new version (e.g. '1.1')? Type 'n' for"
+                        " 'no'. (version/n)"
+                    )
+                    if new_v != "n":
+                        if new_v == "y":
+                            response = input("Please type the version: ")
+                        new_v = response
+                    if new_id is not None or new_v is not None:
+                        nbproject.meta.store.id = new_id
+                        nbproject.meta.store.version = new_v
+                        nbproject.meta.store.write()
+                        # at this point, depending on the editor, the process
+                        # might crash; that is OK, as upon re-running, the
+                        # notebook will have new metadata and will be registered
+                        # in the db. In case the python process does not exit, we
+                        # need a new Notebook record
+                        notebook = lns.Notebook(id=id, v=v)
+                notebook.name = name
+                notebook.title = title
+                ln.add(notebook)
+        # at this point, we have a notebook object
+        cls.notebook = notebook
+    @classmethod
+    def _track_pipeline(
+        cls,
+        name: str,
+        *,
+        version: Optional[str] = None,
+    ):
+        """Track pipeline.
+        Args:
+            name: Pipeline name.
+            version: Pipeline version. If `None`, load latest (sort by created_at).
+        """
+        cls.instance = settings.instance
+        import lamindb as ln
+        import lamindb.schema as lns
+        if version is not None:
+            pipeline = ln.select(lns.Pipeline, name=name, v=version).one()
+        else:
+            pipeline = (
+                ln.select(lns.Pipeline, name=name)
+                .order_by(lns.Pipeline.created_at.desc())
+                .first()
+            )
+            if pipeline is None:
+                response = input(
+                    f"Did not find any pipeline record with name '{name}'. Create a new"
+                    " one? (y/n)"
+                )
+                if response == "y":
+                    pipeline = lns.Pipeline(name=name)
+        cls.pipeline = pipeline
+    @classmethod
+    def _track_notebook_pipeline(
+        cls, *, pipeline_name: Optional[str] = None, load_latest=True
+    ):
+        """Track notebook/pipeline and run.
+        When called from within a Python script, pass `pipeline_name`.
+        Args:
+            pipeline_name: Pipeline name.
+            load_latest: Load the latest run of the notebook or pipeline.
+        """
+        cls.instance = settings.instance
+        logger.info(f"Instance: {cls.instance.identifier}")
+        logger.info(f"User: {settings.user.handle}")
+        if is_run_from_ipython and pipeline_name is None:
+            if context.notebook is None:
+                cls._track_notebook()
+        else:
+            if pipeline_name is None:
+                raise ValueError(
+                    "Pass a pipeline name: ln.context.track(pipeline_name='...')"
+                )
+            cls._track_pipeline(name=pipeline_name)
+            logger.info(f"Pipeline: {cls.pipeline}")

lamindb/_delete.py CHANGED Viewed

@@ -48,14 +48,20 @@ def delete(  # type: ignore
     Example:
-    >>> # Delete by record
-    >>> experiment = ln.select(Experiment, id=experiment_id)
+    1) Delete by record
+    >>> experiment = ln.select(Experiment, id=experiment_id).one()
     >>> ln.delete(experiment)
-    >>> # Delete data objects
-    >>> dobject = ln.select(DObject, id=dobject_id)
-    >>> ln.delete(dobject)
-    >>> # Delete by fields
-    >>> ln.delete(DObject, id=dobject_id)
+    2) Delete by fields
+    >>> ln.delete(Experiment, id=experiment_id)
+    >>> # the result is equivalent to 1)
+    3) Delete data objects (deleting the metadata record and the storage file)
+    >>> dobject = ln.select(DObject, id=dobject_id).one()
+    >>> # deleting the metadata record occurs automatically
+    >>> # you will be asked whether to delete the file from storage
+    >>> # or pass boolean values to `delete_data_from_storage`
+    >>> ln.delete(dobject, delete_data_from_storage)
     Args:
         record: One or multiple records as instances of `SQLModel`.

lamindb/_load.py CHANGED Viewed

@@ -1,46 +1,36 @@
-import lnschema_core as core
+from typing import Optional
 from lamin_logger import logger
-from lndb import settings as setup_settings
+from lnschema_core import DObject
+from sqlalchemy.orm.session import object_session
+from lamindb._context import context
 from ._settings import settings
 from .dev._core import filepath_from_dobject
 from .dev.file import load_to_memory
-def populate_runin(dobject: core.DObject, run: core.Run):
-    setup_settings.instance._cloud_sqlite_locker.lock()
-    with setup_settings.instance.session() as ss:
-        result = ss.get(core.link.RunIn, (run.id, dobject.id))
-        if result is None:
-            ss.add(
-                core.link.RunIn(
-                    run_id=run.id,
-                    dobject_id=dobject.id,
-                )
-            )
-            ss.commit()
-            logger.info(f"Added dobject ({dobject.id}) as input for run ({run.id}).")
-            setup_settings.instance._update_cloud_sqlite_file()
-    setup_settings.instance._cloud_sqlite_locker.unlock()
 # this is exposed to the user as DObject.load
-def load(dobject: core.DObject, stream: bool = False, is_run_input: bool = False):
+def load(dobject: DObject, stream: bool = False, is_run_input: Optional[bool] = None):
     if stream and dobject.suffix not in (".h5ad", ".zarr"):
         logger.warning(f"Ignoring stream option for a {dobject.suffix} object.")
-    filepath = filepath_from_dobject(dobject)
-    # TODO: better design to track run inputs
-    if settings.track_run_inputs_upon_load or is_run_input:
-        from lamindb import nb
-        if nb.run is None:
-            logger.warning(
-                "Input tracking for runs through `load` is currently only implemented"
-                " for notebooks."
+    if is_run_input is None:
+        track_run_input = settings.track_run_inputs_upon_load
+    else:
+        track_run_input = is_run_input
+    if track_run_input:
+        if object_session(dobject) is None:
+            raise ValueError("Need to load with session open to track as input.")
+        if context.run is None:
+            raise ValueError(
+                "No global run context set. Call ln.context.track() or pass input run"
+                " directly."
             )
         else:
-            populate_runin(dobject, nb.run)
-    # TODO: enable track usage
+            dobject.targets.append(context.run)
+            session = object_session(dobject)
+            session.add(dobject)
+            session.commit()
     # track_usage(dobject.id, "load")
-    return load_to_memory(filepath, stream=stream)
+    return load_to_memory(filepath_from_dobject(dobject), stream=stream)

lamindb/_nb.py CHANGED Viewed

@@ -1,12 +1,13 @@
-from pathlib import Path
 from typing import List, Optional, Union
 import nbproject as _nb
 from lamin_logger import logger
-from lndb import settings
-from lnschema_core import Notebook, Run, dev
+from lnschema_core import Notebook, Run
+from ._context import context
+# this whole class is deprecated, see lamindb.context instead!
 class nb:
     """Manage Jupyter notebooks.
@@ -30,7 +31,7 @@ class nb:
         id: Optional[str] = None,
         v: Optional[str] = "0",
         name: Optional[str] = None,
-    ):
+    ) -> Run:
         """Track the notebook & display metadata.
         Call without arguments in most settings.
@@ -54,88 +55,22 @@ class nb:
             v: Pass a notebook version manually.
             name: Pass a notebook name manually.
         """
-        if id is None and name is None:
-            nbproject_failed_msg = (
-                "Auto-retrieval of notebook name & title failed.\nPlease paste error"
-                " at: https://github.com/laminlabs/nbproject/issues/new \n\nFix: Run"
-                f" ln.nb.header(id={dev.id.notebook()}, name='my-notebook-name')"
-            )
-            try:
-                _nb.header(pypackage=pypackage, filepath=filepath, env=env)
-            except Exception:
-                raise RuntimeError(nbproject_failed_msg)
-            # this contains filepath if the header was run successfully
-            from nbproject._header import _filepath
-            id = _nb.meta.store.id
-            v = _nb.meta.store.version
-            name = Path(_filepath).stem
-            title = _nb.meta.live.title
-        elif id is None or name is None:
-            # Both id and name need to be passed if passing it manually
-            raise RuntimeError("Fix: Pass both id & name to ln.nb.header().")
-        else:
-            title = None
-        logger.info(f"Instance: {settings.instance.owner}/{settings.instance.name}")
-        import lamindb as ln
-        import lamindb.schema as lns
-        notebook = ln.select(
-            lns.Notebook,
-            id=id,
-            v=v,
-        ).one_or_none()
-        if notebook is None:
-            notebook = lns.Notebook(
-                id=id,
-                v=v,
-                name=name,
-                title=title,
-            )
-            notebook = ln.add(notebook)
-            logger.info(f"Added notebook: {notebook.id} v{notebook.v}")
-        else:
-            logger.info(f"Loaded notebook: {notebook.id} v{notebook.v}")
-            if notebook.name != name or notebook.title != title:
-                notebook.name = name
-                notebook.title = title
-                ln.add(notebook)
-                logger.info("Updated notebook name or title.")
-        # at this point, we have a notebook object
+        logger.warning(
+            "DeprecationWarning: Please replace ln.nb.header() with ln.Run()"
+        )
+        context._track_notebook(
+            pypackage=pypackage, filepath=filepath, id=id, v=v, name=name, editor=env
+        )
+        notebook = context.notebook
         cls.notebook = notebook
-        # check user input
-        # if isinstance(run, lns.Run):
-        # This here might be something we may want in the future
-        # but checking all the cases in which that run record has integrity
-        # is quite a bit of code - not now!
-        #     run_test = ln.select(lns.Run, id=run.id).one_or_none()
-        #     if run_test is None:
-        #         logger.info("Passed run does not exist, adding it")
-        #         ln.add(run)
-        if run is None:
-            # retrieve the latest run
-            run = (
-                ln.select(lns.Run, notebook_id=notebook.id, notebook_v=notebook.v)
-                .order_by(lns.Run.created_at.desc())
-                .first()
-            )
-            if run is not None:
-                logger.info(f"Loaded run: {run.id}")  # type: ignore
-        elif run != "new":
-            raise ValueError("Fix: ln.nb.header(run='new')!")
-        # create a new run if doesn't exist yet or is requested by the user ("new")
-        if run is None or run == "new":
-            run = lns.Run(notebook_id=notebook.id, notebook_v=notebook.v)
-            run = ln.add(run)  # type: ignore
-            logger.info(f"Added run: {run.id}")  # type: ignore
-        # at this point, we have a run object
+        if run == "new":
+            run = Run(global_context=True)
+        elif run is None:
+            run = Run(global_context=True, load_latest=True)
+        else:
+            raise ValueError("Pass 'new' to ln.nb.header().")
         cls.run = run
+        return run
     @classmethod
     def publish(cls, version: str = None, i_confirm_i_saved: bool = False):

lamindb/_record.py CHANGED Viewed

@@ -29,12 +29,10 @@ NO_SOURCE_ERROR = """
 Error: Please link a data source using the `source` argument.
 Fix: Link a data source by passing a run, e.g., via
-pipeline = ln.select("My ingestion pipeline").one()
 run = lns.Run(pipeline=pipeline)
 dobject = ln.DObject(..., source=run)
-Or, if you're in a notebook, call `ln.nb.header()` at the top, which creates
-a global run context for the notebook.
+Or, by calling ln.context.track(), which sets a global run context.
 More details: https://lamin.ai/docs/faq/ingest
 """
@@ -73,9 +71,11 @@ def serialize(
     return memory_rep, filepath, name, suffix
-def get_hash(local_filepath, suffix):
+def get_hash(local_filepath, suffix, check_hash: bool = True):
     if suffix != ".zarr":  # if not streamed
         hash = hash_file(local_filepath)
+        if not check_hash:
+            return hash
         result = select(lns_DObject, hash=hash).all()
         if len(result) > 0:
             msg = f"A dobject with same hash is already in the DB: {result}"
@@ -171,19 +171,19 @@ def parse_features(
     ).one_or_none()
     if features is not None:
         return features  # features already exists!
-    features = Features(id=features_hash, type=features_ref.entity)
-    records = get_features_records(parsing_id, features_ref, df_curated)
-    if isinstance(features_ref, Gene):
-        for record in records:
-            features.genes.append(record)
-    elif isinstance(features_ref, Protein):
-        for record in records:
-            features.proteins.append(record)
-    elif isinstance(features_ref, CellMarker):
-        for record in records:
-            features.cell_markers.append(record)
+    else:
+        features = Features(id=features_hash, type=features_ref.entity)
+        records = get_features_records(parsing_id, features_ref, df_curated)
+        if isinstance(features_ref, Gene):
+            for record in records:
+                features.genes.append(record)
+        elif isinstance(features_ref, Protein):
+            for record in records:
+                features.proteins.append(record)
+        elif isinstance(features_ref, CellMarker):
+            for record in records:
+                features.cell_markers.append(record)
     return features
@@ -204,11 +204,15 @@ def get_features(dobject_privates, features_ref):
 def get_run(run: Optional[Run]) -> Run:
     if run is None:
-        from . import nb
+        from ._context import context
-        run = nb.run
+        run = context.run
         if run is None:
             raise ValueError(NO_SOURCE_ERROR)
+    # the following ensures that queried objects (within __init__)
+    # behave like queried objects, only example right now: Run
+    if run._ln_identity_key is not None:
+        run._sa_instance_state.key = run._ln_identity_key
     return run
@@ -216,6 +220,7 @@ def get_path_size_hash(
     filepath: Union[Path, UPath],
     memory_rep: Optional[Union[pd.DataFrame, ad.AnnData]],
     suffix: str,
+    check_hash: bool = True,
 ):
     cloudpath = None
     localpath = None
@@ -250,18 +255,20 @@ def get_path_size_hash(
         else:
             size = path.stat().st_size
             localpath = filepath
-            hash = get_hash(filepath, suffix)
+            hash = get_hash(filepath, suffix, check_hash=check_hash)
     return localpath, cloudpath, size, hash
+# expose to user via ln.DObject
 def get_dobject_kwargs_from_data(
     data: Union[Path, UPath, str, pd.DataFrame, ad.AnnData],
     *,
     name: Optional[str] = None,
-    features_ref: Optional[Union[CellMarker, Gene, Protein]] = None,
     source: Optional[Run] = None,
     format: Optional[str] = None,
+    # backward compat
+    features_ref: Optional[Union[CellMarker, Gene, Protein]] = None,
 ):
     run = get_run(source)
     memory_rep, filepath, name, suffix = serialize(data, name, format)
@@ -278,10 +285,17 @@ def get_dobject_kwargs_from_data(
         _memory_rep=memory_rep,
     )
+    # TODO: remove later
+    # backward compat
     if features_ref is not None:
+        logger.warning(
+            "DeprecationWarning: `features_ref` is deprecated, please use"
+            " `ln.Features`!"
+        )
         features = [get_features(dobject_privates, features_ref)]  # has to be list!
     else:
         features = []
     dobject_kwargs = dict(
         name=name,
         suffix=suffix,
@@ -292,9 +306,29 @@ def get_dobject_kwargs_from_data(
         source=run,
         features=features,
     )
     return dobject_kwargs, dobject_privates
+# expose to user via ln.Features
+def get_features_from_data(
+    data: Union[Path, UPath, str, pd.DataFrame, ad.AnnData],
+    reference: Union[CellMarker, Gene, Protein],
+    format: Optional[str] = None,
+):
+    memory_rep, filepath, _, suffix = serialize(data, "features", format)
+    localpath, cloudpath, _, _ = get_path_size_hash(
+        filepath, memory_rep, suffix, check_hash=False
+    )
+    dobject_privates = dict(
+        _local_filepath=localpath,
+        _cloud_filepath=cloudpath,
+        _memory_rep=memory_rep,
+    )
+    return get_features(dobject_privates, reference)
 def to_b64_str(bstr: bytes):
     b64 = base64.urlsafe_b64encode(bstr).decode().strip("=")
     return b64

lamindb 0.32.0rc1__py2.py3-none-any.whl → 0.33.0__py2.py3-none-any.whl

lamindb 0.32.0rc1__py2.py3-none-any.whl → 0.33.0__py2.py3-none-any.whl