PyPI - torchmonarch-nightly - Versions diffs - 2025.8.2__cp311-cp311-manylinux2014_x86_64.whl → 2025.9.3__cp311-cp311-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.8.2__cp311-cp311-manylinux2014_x86_64.whl → 2025.9.3__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

monarch/_rust_bindings.so +0 -0
monarch/_src/actor/actor_mesh.py +414 -216
monarch/_src/actor/allocator.py +75 -6
monarch/_src/actor/bootstrap_main.py +7 -4
monarch/_src/actor/code_sync/__init__.py +2 -0
monarch/_src/actor/debugger/__init__.py +7 -0
monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
monarch/_src/actor/endpoint.py +27 -45
monarch/_src/actor/future.py +86 -24
monarch/_src/actor/host_mesh.py +125 -0
monarch/_src/actor/logging.py +94 -0
monarch/_src/actor/pickle.py +25 -0
monarch/_src/actor/proc_mesh.py +423 -156
monarch/_src/actor/python_extension_methods.py +90 -0
monarch/_src/actor/shape.py +8 -1
monarch/_src/actor/source_loader.py +45 -0
monarch/_src/actor/telemetry/__init__.py +172 -0
monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
monarch/_src/debug_cli/__init__.py +7 -0
monarch/_src/debug_cli/debug_cli.py +43 -0
monarch/_src/tensor_engine/rdma.py +64 -9
monarch/_testing.py +1 -3
monarch/actor/__init__.py +24 -4
monarch/common/_C.so +0 -0
monarch/common/device_mesh.py +14 -0
monarch/common/future.py +10 -0
monarch/common/remote.py +14 -25
monarch/common/tensor.py +12 -0
monarch/debug_cli/__init__.py +7 -0
monarch/debug_cli/__main__.py +12 -0
monarch/fetch.py +2 -2
monarch/gradient/_gradient_generator.so +0 -0
monarch/gradient_generator.py +4 -2
monarch/mesh_controller.py +34 -14
monarch/monarch_controller +0 -0
monarch/tools/colors.py +25 -0
monarch/tools/commands.py +42 -7
monarch/tools/components/hyperactor.py +1 -1
monarch/tools/config/__init__.py +31 -4
monarch/tools/config/defaults.py +13 -3
monarch/tools/config/environment.py +45 -0
monarch/tools/config/workspace.py +165 -0
monarch/tools/mesh_spec.py +2 -0
monarch/utils/__init__.py +9 -0
monarch/utils/utils.py +78 -0
tests/error_test_binary.py +5 -3
tests/python_actor_test_binary.py +52 -0
tests/test_actor_error.py +142 -14
tests/test_alloc.py +1 -1
tests/test_allocator.py +59 -72
tests/test_debugger.py +639 -45
tests/test_env_before_cuda.py +4 -4
tests/test_mesh_trait.py +38 -0
tests/test_python_actors.py +965 -75
tests/test_rdma.py +7 -6
tests/test_tensor_engine.py +6 -6
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0

monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} RENAMED Viewed

@@ -8,9 +8,12 @@
 import bdb
 import inspect
 import io
+import linecache
+import os
 import pdb  # noqa
 import socket
 import sys
+from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import Dict, TYPE_CHECKING
@@ -19,7 +22,7 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
 from monarch._src.actor.sync_state import fake_sync_state
 if TYPE_CHECKING:
-    from monarch._src.actor.debugger import DebugClient
+    from monarch._src.actor.debugger.debugger import DebugController
 @dataclass
@@ -29,31 +32,41 @@ class DebuggerWrite:
     lineno: int | None
+@contextmanager
+def _debug_controller_request_ctx():
+    try:
+        with fake_sync_state():
+            yield
+    except Exception as e:
+        raise bdb.BdbQuit from e
 class PdbWrapper(pdb.Pdb):
     def __init__(
         self,
         rank: int,
         coords: Dict[str, int],
         actor_id: ActorId,
-        client_ref: "DebugClient",
+        controller: "DebugController",
         header: str | None = None,
     ):
         self.rank = rank
         self.coords = coords
         self.header = header
         self.actor_id = actor_id
-        self.client_ref = client_ref
+        self.controller = controller
         # pyre-ignore
         super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
         self._first = True
     def set_trace(self, frame=None):
-        self.client_ref.debugger_session_start.broadcast(
-            self.rank,
-            self.coords,
-            socket.getfqdn(socket.gethostname()),
-            self.actor_id.actor_name,
-        )
+        with _debug_controller_request_ctx():
+            self.controller.debugger_session_start.call_one(
+                self.rank,
+                self.coords,
+                socket.getfqdn(socket.gethostname()),
+                self.actor_id.actor_name,
+            ).get()
         if self.header:
             self.message(self.header)
         super().set_trace(frame)
@@ -69,10 +82,35 @@ class PdbWrapper(pdb.Pdb):
         else:
             super().do_clear(arg)
+    def lookupmodule(self, filename):
+        filename = super().lookupmodule(filename)
+        if (
+            filename is not None
+            and not os.path.exists(filename)
+            and filename not in linecache.cache
+        ):
+            from monarch._src.actor.actor_mesh import ActorError
+            from monarch._src.actor.source_loader import load_remote_source
+            try:
+                with fake_sync_state():
+                    source = load_remote_source(filename)
+                    if source:
+                        linecache.cache[filename] = (
+                            len(source),
+                            None,
+                            source.splitlines(keepends=True),
+                            filename,
+                        )
+            except ActorError as e:
+                self.error(f"Failed querying root client host for source code: {e}")
+        return filename
     def end_debug_session(self):
-        self.client_ref.debugger_session_end.broadcast(
-            self.actor_id.actor_name, self.rank
-        )
+        with _debug_controller_request_ctx():
+            self.controller.debugger_session_end.call_one(
+                self.actor_id.actor_name, self.rank
+            ).get()
         # Once the debug client actor is notified of the session being over,
         # we need to prevent any additional requests being sent for the session
         # by redirecting stdin and stdout.
@@ -91,8 +129,8 @@ class ReadWrapper(io.RawIOBase):
         self.session = session
     def readinto(self, b):
-        with fake_sync_state():
-            response = self.session.client_ref.debugger_read.call_one(
+        with _debug_controller_request_ctx():
+            response = self.session.controller.debugger_read.call_one(
                 self.session.actor_id.actor_name, self.session.rank, len(b)
             ).get()
             if response == "detach":
@@ -128,15 +166,16 @@ class WriteWrapper:
             function = f"{inspect.getmodulename(self.session.curframe.f_code.co_filename)}.{self.session.curframe.f_code.co_name}"
             # pyre-ignore
             lineno = self.session.curframe.f_lineno
-        self.session.client_ref.debugger_write.broadcast(
-            self.session.actor_id.actor_name,
-            self.session.rank,
-            DebuggerWrite(
-                s.encode(),
-                function,
-                lineno,
-            ),
-        )
+        with _debug_controller_request_ctx():
+            self.session.controller.debugger_write.call_one(
+                self.session.actor_id.actor_name,
+                self.session.rank,
+                DebuggerWrite(
+                    s.encode(),
+                    function,
+                    lineno,
+                ),
+            ).get()
     def flush(self):
         pass

monarch/_src/actor/endpoint.py CHANGED Viewed

@@ -11,7 +11,6 @@ from abc import ABC, abstractmethod
 from operator import mul
 from typing import (
     Any,
-    AsyncGenerator,
     Awaitable,
     Callable,
     cast,
@@ -31,36 +30,25 @@ from typing import (
     TypeVar,
 )
+from monarch._rust_bindings.monarch_hyperactor.shape import Extent
 from monarch._src.actor.future import Future
 from monarch._src.actor.tensor_engine_shim import _cached_propagation, fake_call
 if TYPE_CHECKING:
     from monarch._src.actor.actor_mesh import (
-        ActorMeshRef,
+        ActorMesh,
+        HyOncePortReceiver,
         HyPortReceiver,
-        OncePortReceiver,
         Port,
-        PortTuple,
+        PortReceiver,
         ValueMesh,
     )
 P = ParamSpec("P")
 R = TypeVar("R")
-Selection = Literal["all", "choose"] | int
-class Extent:
-    def __init__(self, labels: Sequence[str], sizes: Sequence[int]) -> None:
-        self.labels = labels
-        self.sizes = sizes
-    @property
-    def nelements(self) -> int:
-        return functools.reduce(mul, self.sizes, 1)
-    def __str__(self) -> str:
-        return str(dict(zip(self.labels, self.sizes)))
+Selection = Literal["all", "choose"]
 Propagator = Any
@@ -90,9 +78,10 @@ class Endpoint(ABC, Generic[P, R]):
         """
         pass
-    @abstractmethod
-    def _port(self, once: bool = False) -> "PortTuple[R]":
-        pass
+    def _port(self, once: bool = False) -> "Tuple[Port[R], PortReceiver[R]]":
+        from monarch._src.actor.actor_mesh import Channel
+        return Channel[R].open(once)
     @abstractmethod
     def _call_name(self) -> Any:
@@ -101,7 +90,7 @@ class Endpoint(ABC, Generic[P, R]):
         """
         pass
-    def _supervise(self, r: "HyPortReceiver | OncePortReceiver") -> Any:
+    def _supervise(self, r: "HyPortReceiver | HyOncePortReceiver") -> Any:
         return r
     # the following are all 'adverbs' or different ways to handle the
@@ -115,17 +104,14 @@ class Endpoint(ABC, Generic[P, R]):
         Load balanced RPC-style entrypoint for request/response messaging.
         """
-        from monarch._src.actor.actor_mesh import port
-        p, r = port(self, once=True)
+        p, r = self._port(once=True)
         # pyre-ignore
         self._send(args, kwargs, port=p, selection="choose")
         return r.recv()
     def call_one(self, *args: P.args, **kwargs: P.kwargs) -> Future[R]:
-        from monarch._src.actor.actor_mesh import port
-        p, r = port(self, once=True)
+        p, r = self._port(once=True)
         # pyre-ignore
         extent = self._send(args, kwargs, port=p, selection="choose")
         if extent.nelements != 1:
@@ -135,9 +121,10 @@ class Endpoint(ABC, Generic[P, R]):
         return r.recv()
     def call(self, *args: P.args, **kwargs: P.kwargs) -> "Future[ValueMesh[R]]":
-        from monarch._src.actor.actor_mesh import ranked_port, ValueMesh
+        from monarch._src.actor.actor_mesh import ValueMesh
-        p, r = ranked_port(self)
+        p, unranked = self._port()
+        r = unranked.ranked()
         # pyre-ignore
         extent = self._send(args, kwargs, port=p)
@@ -157,29 +144,24 @@ class Endpoint(ABC, Generic[P, R]):
         return Future(coro=process())
-    def _stream(
+    def stream(
         self, *args: P.args, **kwargs: P.kwargs
-    ) -> Generator[Coroutine[Any, Any, R], None, None]:
+    ) -> Generator[Future[R], None, None]:
         """
         Broadcasts to all actors and yields their responses as a stream / generator.
         This enables processing results from multiple actors incrementally as
         they become available. Returns a generator of futures, one per response.
         """
-        from monarch._src.actor.actor_mesh import port
-        p, r = port(self)
-        # pyre-ignore
+        p, r = self._port()
+        # type: ignore
         extent = self._send(args, kwargs, port=p)
-        for _ in range(extent.nelements):
-            # pyre-ignore
-            yield r._recv()
-    def stream(
-        self, *args: P.args, **kwargs: P.kwargs
-    ) -> Generator[Future[R], None, None]:
-        for coro in self._stream(*args, **kwargs):
-            yield Future(coro=coro)
+        def _stream():
+            for _ in range(extent.nelements):
+                yield r.recv()
+        return _stream()
     def broadcast(self, *args: P.args, **kwargs: P.kwargs) -> None:
         """
@@ -261,12 +243,12 @@ class EndpointProperty(Generic[P, R]):
 class NotAnEndpoint:
     """
-    Used as the dynamic value of functions on an ActorMeshRef that were not marked as endpoints.
+    Used as the dynamic value of functions on an ActorMesh that were not marked as endpoints.
     This is used both to give a better error message (since we cannot prevent the type system from thinking they are methods),
     and to provide the opportunity for someone to do endpoint(x.foo) on something that wasn't marked as an endpoint.
     """
-    def __init__(self, ref: "ActorMeshRef", name: str):
+    def __init__(self, ref: "ActorMesh", name: str):
         self._ref = ref
         self._name = name

monarch/_src/actor/future.py CHANGED Viewed

@@ -6,6 +6,7 @@
 import asyncio
 import traceback
+import warnings
 from functools import partial
 from typing import (
     Any,
@@ -19,9 +20,13 @@ from typing import (
     TypeVar,
 )
-from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
+from monarch._rust_bindings.monarch_hyperactor.pytokio import (
+    is_tokio_thread,
+    PythonTask,
+    Shared,
+)
-from typing_extensions import Self
+from typing_extensions import deprecated, Self
 R = TypeVar("R")
@@ -78,7 +83,11 @@ class _Asyncio(NamedTuple):
     fut: asyncio.Future
-_Status = _Unawaited | _Complete | _Exception | _Asyncio
+class _Tokio(NamedTuple):
+    shared: Shared
+_Status = _Unawaited | _Complete | _Exception | _Asyncio | _Tokio
 class Future(Generic[R]):
@@ -107,31 +116,60 @@ class Future(Generic[R]):
                 return cast("R", value)
             case _Exception(exe=exe):
                 raise exe
+            case _Tokio(_):
+                raise ValueError(
+                    "already converted into a pytokio.Shared object, use 'await' from a PythonTask coroutine to get the value."
+                )
             case _:
                 raise RuntimeError("unknown status")
     def __await__(self) -> Generator[Any, Any, R]:
-        match self._status:
-            case _Unawaited(coro=coro):
-                loop = asyncio.get_running_loop()
-                fut = loop.create_future()
-                self._status = _Asyncio(fut)
-                async def mark_complete():
-                    try:
-                        func, value = fut.set_result, await coro
-                    except Exception as e:
-                        func, value = fut.set_exception, e
-                    loop.call_soon_threadsafe(func, value)
-                PythonTask.from_coroutine(mark_complete()).spawn()
-                return fut.__await__()
-            case _Asyncio(fut=fut):
-                return fut.__await__()
-            case _:
-                raise ValueError(
-                    "already converted into a synchronous future, use 'get' to get the value."
-                )
+        if asyncio._get_running_loop() is not None:
+            match self._status:
+                case _Unawaited(coro=coro):
+                    loop = asyncio.get_running_loop()
+                    fut = loop.create_future()
+                    self._status = _Asyncio(fut)
+                    async def mark_complete():
+                        try:
+                            func, value = fut.set_result, await coro
+                        except Exception as e:
+                            func, value = fut.set_exception, e
+                        loop.call_soon_threadsafe(func, value)
+                    PythonTask.from_coroutine(mark_complete()).spawn()
+                    return fut.__await__()
+                case _Asyncio(fut=fut):
+                    return fut.__await__()
+                case _Tokio(_):
+                    raise ValueError(
+                        "already converted into a tokio future, but being awaited from the asyncio loop."
+                    )
+                case _:
+                    raise ValueError(
+                        "already converted into a synchronous future, use 'get' to get the value."
+                    )
+        elif is_tokio_thread():
+            match self._status:
+                case _Unawaited(coro=coro):
+                    shared = coro.spawn()
+                    self._status = _Tokio(shared)
+                    return shared.__await__()
+                case _Tokio(shared=shared):
+                    return shared.__await__()
+                case _Asyncio(_):
+                    raise ValueError(
+                        "already converted into asyncio future, but being awaited from the tokio loop."
+                    )
+                case _:
+                    raise ValueError(
+                        "already converted into a synchronous future, use 'get' to get the value."
+                    )
+        else:
+            raise ValueError(
+                "__await__ with no active event loop (either asyncio or tokio)"
+            )
     # compatibility with old tensor engine Future objects
     # hopefully we do not need done(), add_callback because
@@ -145,3 +183,27 @@ class Future(Generic[R]):
             return None
         except Exception as e:
             return e
+class DeprecatedNotAFuture:
+    """
+    We used to return Future[Alloc] and Future[Actor] and Future[ProcMesh].
+    Now the only Futures are generated as responses to messages.
+    This polyfills the await/get methods to those objects and raises the deprecation
+    warning that we are going to remove this.
+    """
+    def get(self) -> "Self":
+        cls = type(self)
+        typ = f"{cls.__module__}.{cls.__qualname__}"
+        warnings.warn(
+            f"This get()/await can be removed. get() and await is deprecated for {typ}, we directly return {typ} instead of Future[{typ}].\n",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self
+    def __await__(self) -> "Generator[Any, Any, Self]":
+        yield from ()
+        return self

monarch/_src/actor/host_mesh.py ADDED Viewed

@@ -0,0 +1,125 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import warnings
+from math import prod
+from typing import Callable, Dict, Optional, Tuple
+from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
+from monarch._src.actor.actor_mesh import context
+from monarch._src.actor.allocator import AllocateMixin, AllocHandle, LocalAllocator
+from monarch._src.actor.proc_mesh import _get_bootstrap_args, ProcessAllocator, ProcMesh
+from monarch._src.actor.shape import MeshTrait, NDSlice, Shape
+def this_host() -> "HostMesh":
+    """
+    The current machine.
+    This is just shorthand for looking it up via the context
+    """
+    return context().actor_instance.proc.host_mesh
+def this_proc() -> "ProcMesh":
+    """
+    The current singleton process that this specific actor is
+    running on
+    """
+    return context().actor_instance.proc
+def create_local_host_mesh() -> "HostMesh":
+    cmd, args, env = _get_bootstrap_args()
+    return HostMesh(Shape.unity(), ProcessAllocator(cmd, args, env))
+class HostMesh(MeshTrait):
+    def __init__(self, shape: Shape, allocator: AllocateMixin):
+        self._allocator = allocator
+        self._shape = shape
+        self._spawned = 0
+    def _alloc(self, hosts: int, gpus: int) -> "AllocHandle":
+        spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+        return self._allocator.allocate(spec)
+    def spawn_procs(
+        self,
+        per_host: Optional[Dict[str, int]] = None,
+        bootstrap: Optional[Callable[[], None]] = None,
+    ) -> "ProcMesh":
+        """
+        Start new processes on this host mesh. By default this starts one proc
+        on each host in the mesh. Additional procs can be started using `per_host` to
+        specify the local shape, e.g.
+            per_host = {'gpus': 8}
+        Will create a proc mesh with an additional 'gpus' dimension.
+        `bootstrap` is a function that will be run at startup on each proc and can be used to e.g.
+        configure CUDA or NCCL. We guarantee that CUDA has not been initialized before bootstrap is called.
+        """
+        if per_host is None:
+            per_host = {}
+        if self._spawned > 0 and len(self._ndslice) > 1:
+            warnings.warn(
+                "spawning multiple procs on the same host mesh is kinda fake at the moment, there is no guarentee that the two different spawns will be on shared hosts",
+                stacklevel=2,
+            )
+        self._spawned += 1
+        hosts = len(self._ndslice)
+        flat_per_host = prod(per_host.values())
+        alloc_handle = self._alloc(hosts, flat_per_host)
+        new_extent = dict(zip(self._labels, self._ndslice.sizes))
+        conflicting_keys = set(per_host.keys()) & set(new_extent.keys())
+        if conflicting_keys:
+            raise ValueError(
+                f"host mesh already has dims {', '.join(sorted(conflicting_keys))}"
+            )
+        new_extent.update(per_host)
+        return ProcMesh.from_alloc(alloc_handle.reshape(new_extent), bootstrap)
+    @property
+    def _ndslice(self) -> NDSlice:
+        return self._shape.ndslice
+    @property
+    def _labels(self) -> Tuple[str, ...]:
+        return tuple(self._shape.labels)
+    def _new_with_shape(self, shape: Shape) -> "HostMesh":
+        warnings.warn(
+            "Slicing a host mesh is kinda fake at the moment, there is no guarentee that procs in the slice will end up on the corresponding hosts",
+            stacklevel=2,
+        )
+        return HostMesh(
+            Shape(self._labels, NDSlice.new_row_major(self._ndslice.sizes)),
+            self._allocator,
+        )
+def fake_in_process_host() -> "HostMesh":
+    return HostMesh(Shape.unity(), LocalAllocator())
+def hosts_from_config(name: str):
+    """
+    Get the host mesh 'name' from the monarch configuration for the project.
+    This config can be modified so that the same code can create meshes from scheduler sources,
+    and different sizes etc.
+    WARNING: This function is a standin so that our getting_started example code works. The real implementation
+    needs an RFC design.
+    """
+    shape = Shape(["hosts"], NDSlice.new_row_major([2]))
+    return HostMesh(shape, ProcessAllocator(*_get_bootstrap_args()))

monarch/_src/actor/logging.py ADDED Viewed

@@ -0,0 +1,94 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import gc
+import logging
+from typing import Callable
+from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
+from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+from monarch._src.actor.future import Future
+IN_IPYTHON = False
+try:
+    # Check if we are in ipython environment
+    # pyre-ignore[21]
+    from IPython import get_ipython
+    # pyre-ignore[21]
+    from IPython.core.interactiveshell import ExecutionResult
+    IN_IPYTHON = get_ipython() is not None
+except ImportError:
+    pass
+class LoggingManager:
+    def __init__(self) -> None:
+        self._logging_mesh_client: LoggingMeshClient | None = None
+        self._ipython_flush_logs_handler: Callable[..., None] | None = None
+    async def init(self, proc_mesh: HyProcMesh, stream_to_client: bool) -> None:
+        if self._logging_mesh_client is not None:
+            return
+        self._logging_mesh_client = await LoggingMeshClient.spawn(proc_mesh=proc_mesh)
+        self._logging_mesh_client.set_mode(
+            stream_to_client=stream_to_client,
+            aggregate_window_sec=3 if stream_to_client else None,
+            level=logging.INFO,
+        )
+        if IN_IPYTHON:
+            # For ipython environment, a cell can end fast with threads running in background.
+            # Flush all the ongoing logs proactively to avoid missing logs.
+            assert self._logging_mesh_client is not None
+            logging_client: LoggingMeshClient = self._logging_mesh_client
+            ipython = get_ipython()
+            # pyre-ignore[11]
+            def flush_logs(_: ExecutionResult) -> None:
+                try:
+                    Future(coro=logging_client.flush().spawn().task()).get(3)
+                except TimeoutError:
+                    # We need to prevent failed proc meshes not coming back
+                    pass
+            # Force to recycle previous undropped proc_mesh.
+            # Otherwise, we may end up with unregistered dead callbacks.
+            gc.collect()
+            # Store the handler reference so we can unregister it later
+            self._ipython_flush_logs_handler = flush_logs
+            ipython.events.register("post_run_cell", flush_logs)
+    async def logging_option(
+        self,
+        stream_to_client: bool = True,
+        aggregate_window_sec: int | None = 3,
+        level: int = logging.INFO,
+    ) -> None:
+        if level < 0 or level > 255:
+            raise ValueError("Invalid logging level: {}".format(level))
+        assert self._logging_mesh_client is not None
+        self._logging_mesh_client.set_mode(
+            stream_to_client=stream_to_client,
+            aggregate_window_sec=aggregate_window_sec,
+            level=level,
+        )
+    def stop(self) -> None:
+        if self._ipython_flush_logs_handler is not None:
+            assert IN_IPYTHON
+            ipython = get_ipython()
+            assert ipython is not None
+            ipython.events.unregister("post_run_cell", self._ipython_flush_logs_handler)
+            self._ipython_flush_logs_handler = None