PyPI - torchmonarch-nightly - Versions diffs - 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

monarch/__init__.py +13 -9
monarch/_rust_bindings.so +0 -0
monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
monarch/_src/actor/actor_mesh.py +874 -0
monarch/{allocator.py → _src/actor/allocator.py} +26 -17
monarch/_src/actor/bootstrap_main.py +73 -0
monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
monarch/_src/actor/code_sync/auto_reload.py +223 -0
monarch/_src/actor/debugger.py +565 -0
monarch/_src/actor/endpoint.py +270 -0
monarch/_src/actor/event_loop.py +97 -0
monarch/_src/actor/future.py +100 -0
monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
monarch/_src/actor/proc_mesh.py +500 -0
monarch/_src/actor/sync_state.py +18 -0
monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
monarch/_src/actor/tensor_engine_shim.py +56 -0
monarch/_src/tensor_engine/rdma.py +180 -0
monarch/_testing.py +3 -2
monarch/actor/__init__.py +51 -0
monarch/actor_mesh.py +6 -765
monarch/bootstrap_main.py +8 -47
monarch/common/client.py +1 -1
monarch/common/controller_api.py +2 -1
monarch/common/device_mesh.py +12 -2
monarch/common/messages.py +12 -1
monarch/common/recording.py +4 -3
monarch/common/remote.py +135 -52
monarch/common/tensor.py +2 -1
monarch/controller/backend.py +2 -2
monarch/controller/controller.py +2 -1
monarch/controller/rust_backend/controller.py +2 -1
monarch/fetch.py +3 -5
monarch/mesh_controller.py +201 -139
monarch/monarch_controller +0 -0
monarch/opaque_module.py +4 -6
monarch/opaque_object.py +3 -3
monarch/proc_mesh.py +6 -309
monarch/python_local_mesh.py +1 -1
monarch/rust_backend_mesh.py +2 -1
monarch/rust_local_mesh.py +4 -2
monarch/sim_mesh.py +10 -19
monarch/simulator/command_history.py +1 -1
monarch/simulator/interface.py +2 -1
monarch/simulator/mock_controller.py +1 -1
monarch/simulator/simulator.py +1 -1
monarch/tensor_engine/__init__.py +23 -0
monarch/tensor_worker_main.py +3 -1
monarch/tools/cli.py +3 -1
monarch/tools/commands.py +95 -35
monarch/tools/mesh_spec.py +55 -0
monarch/tools/utils.py +38 -0
monarch/worker/worker.py +1 -1
monarch/world_mesh.py +2 -1
monarch_supervisor/python_executable.py +6 -3
tests/error_test_binary.py +48 -10
tests/test_actor_error.py +370 -21
tests/test_alloc.py +1 -1
tests/test_allocator.py +373 -17
tests/test_controller.py +2 -0
tests/test_debugger.py +416 -0
tests/test_env_before_cuda.py +162 -0
tests/test_python_actors.py +184 -333
tests/test_rdma.py +198 -0
tests/test_remote_functions.py +40 -12
tests/test_rust_backend.py +7 -5
tests/test_sim_backend.py +1 -4
tests/test_tensor_engine.py +55 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
monarch/_monarch/hyperactor/__init__.py +0 -58
monarch/_monarch/worker/debugger.py +0 -117
monarch/_monarch/worker/logging.py +0 -107
monarch/debugger.py +0 -379
monarch/future.py +0 -76
monarch/rdma.py +0 -162
torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
/monarch/{_monarch/worker → _src}/__init__.py +0 -0
/monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
/monarch/{common → _src/actor}/shape.py +0 -0
/monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0

tests/test_allocator.py CHANGED Viewed

@@ -6,14 +6,17 @@
 # pyre-strict
+import asyncio
 import contextlib
 import importlib.resources
+import logging
 import math
 import os
 import subprocess
 import sys
 import unittest
 from datetime import timedelta
+from time import sleep
 from typing import Generator, Optional
 from unittest import mock
@@ -24,22 +27,29 @@ import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from monarch._rust_bindings.hyperactor_extension.alloc import (
-    AllocConstraints,
-    AllocSpec,
-)
+from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
 from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelAddr,
     ChannelTransport,
 )
-from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
-from monarch.allocator import (
+from monarch._src.actor.actor_mesh import MonarchContext
+from monarch._src.actor.allocator import (
     ALLOC_LABEL_PROC_MESH_NAME,
+    LocalAllocator,
     RemoteAllocator,
     StaticRemoteAllocInitializer,
     TorchXRemoteAllocInitializer,
 )
-from monarch.proc_mesh import ProcMesh
+from monarch._src.actor.sync_state import fake_sync_state
+from monarch.actor import (
+    Actor,
+    current_rank,
+    current_size,
+    endpoint,
+    ProcMesh,
+    ValueMesh,
+)
 from monarch.tools.mesh_spec import MeshSpec, ServerSpec
 from monarch.tools.network import get_sockaddr
@@ -49,6 +59,19 @@ from torchx.specs import AppState
 _100_MILLISECONDS = timedelta(milliseconds=100)
 SERVER_READY = "monarch.tools.commands.server_ready"
+UNUSED = "__UNUSED__"
+class EnvCheckActor(Actor):
+    """Actor that checks for the presence of an environment variable"""
+    def __init__(self) -> None:
+        pass
+    @endpoint
+    async def get_env_var(self, var_name: str) -> str:
+        """Return the value of the specified environment variable or 'NOT_SET' if not found"""
+        return os.environ.get(var_name, "NOT_SET")
 class TestActor(Actor):
@@ -57,6 +80,8 @@ class TestActor(Actor):
     def __init__(self) -> None:
         self.rank: int = current_rank().rank
         self.world_size: int = math.prod(current_size().values())
+        self.logger: logging.Logger = logging.getLogger("test_actor")
+        self.logger.setLevel(logging.INFO)
     @endpoint
     async def compute_world_size(self, master_addr: str, master_port: int) -> int:
@@ -71,17 +96,33 @@ class TestActor(Actor):
         finally:
             dist.destroy_process_group()
+    @endpoint
+    async def log(self, message: str) -> None:
+        print(f"Stdout LogMessage from print: {message}")
+        sys.stderr.write(f"Stderr LogMessage from print: {message}\n")
+        self.logger.info(f"LogMessage from logger: {message}")
 @contextlib.contextmanager
-def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None, None]:
-    with importlib.resources.path(__package__, "") as package_path:
+def remote_process_allocator(
+    addr: Optional[str] = None, timeout: Optional[int] = None
+) -> Generator[str, None, None]:
+    """Start a remote process allocator on addr. If timeout is not None, have it
+    timeout after that many seconds if no messages come in"""
+    with importlib.resources.as_file(
+        importlib.resources.files(__package__)
+    ) as package_path:
         addr = addr or ChannelAddr.any(ChannelTransport.Unix)
+        args = [
+            "process_allocator",
+            f"--addr={addr}",
+        ]
+        if timeout is not None:
+            args.append(f"--timeout-sec={timeout}")
         process_allocator = subprocess.Popen(
-            args=[
-                "process_allocator",
-                f"--addr={addr}",
-            ],
+            args=args,
             env={
                 # prefix PATH with this test module's directory to
                 # give 'process_allocator' and 'monarch_bootstrap' binary resources
@@ -102,6 +143,82 @@ def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None,
                 process_allocator.kill()
+class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
+    async def test_setup_lambda_with_multiple_env_vars(self) -> None:
+        """Test that the setup lambda can set multiple environment variables"""
+        env_vars: dict[str, str] = {
+            "TEST_ENV_VAR_1": "value_1",
+            "TEST_ENV_VAR_2": "value_2",
+            "TEST_ENV_VAR_3": "value_3",
+        }
+        def setup_multiple_env_vars(ctx: MonarchContext) -> None:
+            for name, value in env_vars.items():
+                os.environ[name] = value
+        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
+        allocator = LocalAllocator()
+        alloc = await allocator.allocate(spec)
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
+        try:
+            actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+            for name, expected_value in env_vars.items():
+                actual_value = await actor.get_env_var.call_one(name)
+                self.assertEqual(
+                    actual_value,
+                    expected_value,
+                    f"Environment variable {name} was not set correctly",
+                )
+        finally:
+            await proc_mesh.stop()
+    async def test_setup_lambda_with_context_info(self) -> None:
+        """Test that the setup lambda can access context information"""
+        context_var_name: str = "PROC_MESH_CONTEXT_INFO"
+        def setup_with_context(ctx: MonarchContext) -> None:
+            context_info = f"proc_id:{ctx.proc_id},point_rank:{ctx.point.rank}"
+            os.environ[context_var_name] = context_info
+        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
+        allocator = LocalAllocator()
+        alloc = await allocator.allocate(spec)
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_context)
+        try:
+            actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+            context_info = await actor.get_env_var.call_one(context_var_name)
+            self.assertNotEqual(
+                context_info,
+                "NOT_SET",
+                "Context information was not stored in the environment variable",
+            )
+            self.assertIn(
+                "proc_id:", context_info, "Context information does not contain proc_id"
+            )
+            self.assertIn(
+                "point_rank:0",
+                context_info,
+                f"Context information {context_info} does not contain point_rank",
+            )
+        finally:
+            await proc_mesh.stop()
 class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -153,7 +270,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
             """test initializer that returns an empty list of addresses"""
             async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
-                _ = match_labels  # Suppress unused variable warning
+                _ = match_labels
                 return []
         empty_initializer = EmptyAllocInitializer()
@@ -191,6 +308,209 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
             self.assert_computed_world_size(values, world_size)
+    async def test_stop_proc_mesh_blocking(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            # XXX - it is not clear why this is trying to use
+            # async code in a sync context.
+            with fake_sync_state():
+                actor = proc_mesh.spawn("test_actor", TestActor).get()
+                proc_mesh.stop().get()
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                proc_mesh.spawn("test_actor", TestActor).get()
+            del actor
+    async def test_wrong_address(self) -> None:
+        hosts = 1
+        gpus = 1
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+        # start one process-allocator, but point the RemoteAllocator at a different, freshly generated address
+        with remote_process_allocator():
+            wrong_host = ChannelAddr.any(ChannelTransport.Unix)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(wrong_host),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            with self.assertRaisesRegex(
+                Exception, r"no process has ever been allocated.*"
+            ):
+                await ProcMesh.from_alloc(alloc)
+    async def test_init_failure(self) -> None:
+        class FailInitActor(Actor):
+            def __init__(self) -> None:
+                if current_rank().rank == 0:
+                    raise RuntimeError("fail on init")
+            @endpoint
+            def dummy(self) -> None:
+                pass
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="helloworld",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
+            proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
+            actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
+            with self.assertRaisesRegex(
+                Exception,
+                r"(?s)fail on init",
+            ):
+                await actor_mesh.dummy.call()
+    async def test_stop_proc_mesh(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            actor = await proc_mesh.spawn("test_actor", TestActor)
+            await proc_mesh.stop()
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                await proc_mesh.spawn("test_actor", TestActor)
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we are doing casting without accessing the wrapped type.
+            del actor
+    async def test_stop_proc_mesh_context_manager(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            with self.assertRaises(ValueError, msg="foo"):
+                async with proc_mesh:
+                    actor = await proc_mesh.spawn("test_actor", TestActor)
+                    # Ensure that proc mesh is stopped when context manager exits.
+                    raise ValueError("foo")
+            with self.assertRaises(
+                RuntimeError, msg="`ProcMesh` has already been stopped"
+            ):
+                await proc_mesh.spawn("test_actor", TestActor)
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we are doing casting without accessing the wrapped type.
+            del actor
+    async def test_setup_lambda_sets_env_vars(self) -> None:
+        """Test that the setup lambda can set environment variables during proc_mesh allocation"""
+        test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
+        test_var_value: str = "test_value_123"
+        def setup_env_vars(ctx: MonarchContext) -> None:
+            os.environ[test_var_name] = test_var_value
+        hosts = 2
+        gpus = 4
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
+            try:
+                actor = await proc_mesh.spawn("env_check", EnvCheckActor)
+                env_var_values = await actor.get_env_var.call(test_var_name)
+                env_var_value = env_var_values.item(host=0, gpu=0)
+                self.assertEqual(
+                    env_var_value,
+                    test_var_value,
+                    f"Environment variable {test_var_name} was not set correctly",
+                )
+            finally:
+                await proc_mesh.stop()
+    async def test_stop_proc_mesh_context_manager_multiple_times(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            # We can nest multiple context managers on the same mesh, the innermost
+            # one closes the mesh and it cannot be used after that.
+            async with proc_mesh:
+                async with proc_mesh:
+                    actor = await proc_mesh.spawn("test_actor", TestActor)
+                with self.assertRaises(
+                    RuntimeError, msg="`ProcMesh` has already been stopped"
+                ):
+                    await proc_mesh.spawn("test_actor", TestActor)
+                # Exiting a second time should not raise an error.
+            # TODO(agallagher): It'd be nice to test that this just fails
+            # immediately, trying to access the wrapped actor mesh, but right
+            # now we are doing casting without accessing the wrapped type.
+            del actor
+    async def test_remote_allocator_with_no_connection(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=1, gpu=4)
+        with remote_process_allocator(timeout=1) as host1:
+            # Wait 3 seconds without making any processes, make sure it dies.
+            await asyncio.sleep(3)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            with self.assertRaisesRegex(
+                Exception, "no process has ever been allocated on"
+            ):
+                alloc = await allocator.allocate(spec)
+                await ProcMesh.from_alloc(alloc)
     async def test_stacked_1d_meshes(self) -> None:
         # create two stacked actor meshes on the same host
         # each actor mesh running on separate process-allocators
@@ -244,7 +564,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         # but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
         )
@@ -262,7 +583,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
     async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -295,7 +617,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
     @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
     async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
         server = ServerSpec(
-            name="__UNUSED__",
+            name=UNUSED,
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -338,6 +661,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         server = ServerSpec(
             name="test",
+            scheduler=UNUSED,
             state=AppState.RUNNING,
             meshes=[
                 MeshSpec(
@@ -363,3 +687,35 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
                     )
                 )
                 await ProcMesh.from_alloc(alloc)
+    async def test_log(self) -> None:
+        # create a mesh to log to both stdout and stderr
+        with remote_process_allocator() as host:
+            allocator = RemoteAllocator(
+                world_id="test_actor_logger",
+                initializer=StaticRemoteAllocInitializer(host),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
+            proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
+            # Generate aggregated log every 1 second.
+            await proc_mesh.logging_option(True, 1)
+            actor = await proc_mesh.spawn("actor", TestActor)
+            # Run for 4 seconds, every second generates 5 logs, so we expect to see
+            # 2 actors x 5 logs/actor/sec * 1 sec = 10 logs per aggregation.
+            for _ in range(20):
+                await actor.log.call("Expect to see [10 processes]")
+                sleep(0.2)
+            # Generate aggregated log every 2 seconds.
+            await proc_mesh.logging_option(True, 2)
+            # Run for 8 seconds, every second generates 5 logs, so we expect to see
+            # 2 actors x 5 logs/actor/sec * 2 sec = 20 logs per aggregation.
+            for _ in range(40):
+                await actor.log.call("Expect to see [20 processes]")
+                sleep(0.2)
+            print("======== All Done ========")

tests/test_controller.py CHANGED Viewed

@@ -653,6 +653,8 @@ def test_panicking_worker():
             _ = fetch_shard(torch.ones(2, 3)).result()
+# TODO - re-enable after resolving T232206970
+@pytest.mark.oss_skip
 def test_timeout_warning(caplog):
     timeout = 3
     with local_rust_device_mesh(