torchmonarch-nightly 2025.7.1__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.26__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +878 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +303 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +508 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +59 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +53 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +21 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +263 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +129 -47
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +18 -1
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +59 -1
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +369 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +161 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +81 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
- torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
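The bulk of this release is a restructuring: the public top-level modules (actor_mesh, proc_mesh, allocator, debugger, rdma, and friends) move under monarch/_src/, with thin public packages monarch/actor and monarch/tensor_engine on top. As a hedged sketch of what that means for imports: the monarch.actor names below are taken from the new tests in this diff, while the old-style lines and the tensor_engine re-export are assumptions inferred from the file moves, not confirmed API.

# Before (2025.7.1) -- hypothetical old-style imports, inferred from the
# removed/renamed files in the listing above:
# from monarch.actor_mesh import Actor
# from monarch.proc_mesh import proc_mesh
# from monarch.rdma import ...  # removed; see monarch/_src/tensor_engine/rdma.py

# After (2025.7.26) -- these names appear in the new tests in this diff:
from monarch.actor import Actor, endpoint, ProcMesh
from monarch._src.actor.proc_mesh import proc_mesh
# monarch/tensor_engine/__init__.py is new (+23 lines) and presumably
# re-exports the RDMA helpers; that re-export is an assumption.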
tests/test_debugger.py
ADDED
@@ -0,0 +1,416 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import asyncio
import re
import sys
from unittest.mock import AsyncMock, MagicMock, patch

import monarch
import monarch.actor as actor

import pytest

import torch

from monarch._src.actor.actor_mesh import Actor, current_rank
from monarch._src.actor.debugger import (
    Attach,
    Cast,
    Continue,
    DebugClient,
    DebugCommand,
    DebugSession,
    Help,
    ListCommand,
    Quit,
)
from monarch._src.actor.endpoint import endpoint

from monarch._src.actor.proc_mesh import proc_mesh

needs_cuda = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="CUDA not available",
)


def _bad_rank():
    raise ValueError("bad rank")


def _debugee_actor_internal(rank):
    if rank == 0:
        breakpoint()  # noqa
        rank += 1
        rank += 1
        return rank
    elif rank == 1:
        breakpoint()  # noqa
        rank += 2
        rank += 2
        return rank
    elif rank == 2:
        breakpoint()  # noqa
        rank += 3
        rank += 3
        _bad_rank()
    elif rank == 3:
        breakpoint()  # noqa
        rank += 4
        rank += 4
        return rank


class DebugeeActor(Actor):
    @endpoint
    async def to_debug(self):
        rank = current_rank().rank
        return _debugee_actor_internal(rank)


@pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="Not enough GPUs, this test requires at least 2 GPUs",
)
async def test_debug() -> None:
    input_mock = AsyncMock()
    input_mock.side_effect = [
        "attach 1",
        "n",
        "n",
        "n",
        "n",
        "detach",
        "attach 1",
        "detach",
        "quit",
        "cast ranks(0,3) n",
        "cast ranks(0,3) n",
        # Attaching to 0 and 3 ensures that when we call "list"
        # the next time, their function/lineno info will be
        # up-to-date.
        "attach 0",
        "detach",
        "attach 3",
        "detach",
        "quit",
        "attach 2",
        "c",
        "detach",
        "quit",
        "attach 2",
        "bt",
        "c",
        "quit",
        "continue",
    ]

    outputs = []

    def _patch_output(msg):
        nonlocal outputs
        outputs.append(msg)

    with patch(
        "monarch._src.actor.debugger._debugger_input", side_effect=input_mock
    ), patch("monarch._src.actor.debugger._debugger_output", new=_patch_output):
        proc = await proc_mesh(hosts=2, gpus=2)
        debugee = await proc.spawn("debugee", DebugeeActor)
        debug_client = actor.debug_client()

        fut = debugee.to_debug.call()
        await debug_client.wait_pending_session.call_one()
        breakpoints = []
        for i in range(10):
            breakpoints = await debug_client.list.call_one()
            if len(breakpoints) == 4:
                break
            await asyncio.sleep(1)
            if i == 9:
                raise RuntimeError("timed out waiting for breakpoints")

        initial_linenos = {}
        for i in range(len(breakpoints)):
            rank, coords, _, _, function, lineno = breakpoints[i]
            initial_linenos[rank] = lineno
            assert rank == i
            assert coords == {"hosts": rank // 2, "gpus": rank % 2}
            assert function == "test_debugger._debugee_actor_internal"
            assert lineno == breakpoints[0][5] + 5 * rank

        await debug_client.enter.call_one()

        # Check that when detaching and re-attaching to a session, the last
        # portion of the output is repeated
        expected_last_output = [
            r"--Return--",
            r"\n",
            r"> (/.*/)+test_debugger.py\(\d+\)to_debug\(\)->5\n-> return _debugee_actor_internal\(rank\)",
            r"\n",
            r"\(Pdb\) ",
        ]
        output_len = len(expected_last_output)
        assert outputs[-2 * output_len : -output_len] == outputs[-output_len:]
        for real_output, expected_output in zip(
            outputs[-output_len:], expected_last_output
        ):
            assert re.match(expected_output, real_output) is not None

        breakpoints = await debug_client.list.call_one()
        for i in range(len(breakpoints)):
            if i == 1:
                assert breakpoints[i][4] == "test_debugger.to_debug"
            else:
                assert breakpoints[i][4] == "test_debugger._debugee_actor_internal"
                assert breakpoints[i][5] == initial_linenos[i]

        await debug_client.enter.call_one()

        breakpoints = await debug_client.list.call_one()
        for i in range(len(breakpoints)):
            if i == 1:
                assert breakpoints[i][4] == "test_debugger.to_debug"
            elif i in (0, 3):
                assert breakpoints[i][4] == "test_debugger._debugee_actor_internal"
                assert breakpoints[i][5] == initial_linenos[i] + 2
            else:
                assert breakpoints[i][4] == "test_debugger._debugee_actor_internal"
                assert breakpoints[i][5] == initial_linenos[i]

        await debug_client.enter.call_one()

        breakpoints = await debug_client.list.call_one()
        assert len(breakpoints) == 4
        # Expect post-mortem debugging for rank 2
        assert breakpoints[2][4] == "test_debugger._bad_rank"

        await debug_client.enter.call_one()

        expected_last_output = [
            r"\s*(/.*/)+test_debugger.py\(\d+\)_debugee_actor_internal\(\)\n-> _bad_rank\(\)",
            r"\n",
            r'> (/.*/)+test_debugger.py\(\d+\)_bad_rank\(\)\n-> raise ValueError\("bad rank"\)',
            r"\n",
            r"\(Pdb\) ",
        ]

        for output, expected_output in zip(
            outputs[-len(expected_last_output) :], expected_last_output
        ):
            assert re.match(expected_output, output) is not None

        breakpoints = await debug_client.list.call_one()
        assert len(breakpoints) == 3
        for i, rank in enumerate((0, 1, 3)):
            assert breakpoints[i][0] == rank

        await debug_client.enter.call_one()
        breakpoints = await debug_client.list.call_one()
        assert len(breakpoints) == 0

        with pytest.raises(
            monarch._src.actor.actor_mesh.ActorError, match="ValueError: bad rank"
        ):
            await fut


async def test_cast_input_and_wait() -> None:
    debug_client = DebugClient()

    mock_sessions = {}
    for host in range(3):
        for gpu in range(8):
            rank = host * 8 + gpu
            mock_session = MagicMock(spec=DebugSession)
            mock_session.attach = AsyncMock()
            mock_session.rank = rank
            mock_session.coords = {"hosts": host, "gpus": gpu}
            mock_sessions[rank] = mock_session

    debug_client.sessions = mock_sessions

    # Cast to a single rank
    await debug_client._cast_input_and_wait("n", 2)
    mock_sessions[2].attach.assert_called_once_with("n", suppress_output=True)
    for rank, session in mock_sessions.items():
        if rank != 2:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast to a list of ranks
    ranks = [1, 3, 5]
    await debug_client._cast_input_and_wait("n", ranks)
    for rank in ranks:
        mock_sessions[rank].attach.assert_called_once_with("n", suppress_output=True)
    for rank, session in mock_sessions.items():
        if rank not in ranks:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast to a range of ranks
    ranks = range(2, 24, 3)
    await debug_client._cast_input_and_wait("n", ranks)
    for rank in ranks:
        mock_sessions[rank].attach.assert_called_once_with("n", suppress_output=True)
    for rank, session in mock_sessions.items():
        if rank not in ranks:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast to all ranks
    await debug_client._cast_input_and_wait("n", None)
    for session in mock_sessions.values():
        session.attach.assert_called_once_with("n", suppress_output=True)

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast using dimension filtering with a single value
    await debug_client._cast_input_and_wait("n", {"hosts": 1})
    for session in mock_sessions.values():
        if session.coords["hosts"] == 1:
            session.attach.assert_called_once_with("n", suppress_output=True)
        else:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast using dimension filtering with a list
    await debug_client._cast_input_and_wait("n", {"hosts": [0, 2]})
    for _rank, session in mock_sessions.items():
        if session.coords["hosts"] in [0, 2]:
            session.attach.assert_called_once_with("n", suppress_output=True)
        else:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast using dimension filtering with a range
    await debug_client._cast_input_and_wait("n", {"gpus": range(5, 8)})
    for session in mock_sessions.values():
        if session.coords["gpus"] in range(5, 8):
            session.attach.assert_called_once_with("n", suppress_output=True)
        else:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast using multiple dimension filters
    await debug_client._cast_input_and_wait(
        "n", {"hosts": [1, 3], "gpus": range(0, sys.maxsize, 3)}
    )
    for session in mock_sessions.values():
        if session.coords["hosts"] in [1, 3] and session.coords["gpus"] in range(
            0, sys.maxsize, 3
        ):
            session.attach.assert_called_once_with("n", suppress_output=True)
        else:
            session.attach.assert_not_called()

    for session in mock_sessions.values():
        session.attach.reset_mock()

    # Cast with non-existent dimension
    await debug_client._cast_input_and_wait("n", {"hosts": 0, "gpus": 0, "foo": 0})
    for session in mock_sessions.values():
        session.attach.assert_not_called()


@pytest.mark.parametrize(
    ["user_input", "expected_output"],
    [
        ("attach 1", Attach(1)),
        ("a 100", Attach(100)),
        ("list", ListCommand()),
        ("l", ListCommand()),
        ("help", Help()),
        ("h", Help()),
        ("quit", Quit()),
        ("q", Quit()),
        ("continue", Continue()),
        ("c", Continue()),
        ("cast ranks(123) b 25", Cast(ranks=123, command="b 25")),
        ("cast ranks(12,34,56) b 25", Cast(ranks=[12, 34, 56], command="b 25")),
        ("cast ranks(:) b 25", Cast(ranks=range(0, sys.maxsize), command="b 25")),
        ("cast ranks(:123) b 25", Cast(ranks=range(0, 123), command="b 25")),
        ("cast ranks(123:) b 25", Cast(ranks=range(123, sys.maxsize), command="b 25")),
        ("cast ranks(123:456) b 25", Cast(ranks=range(123, 456), command="b 25")),
        ("cast ranks(::) b 25", Cast(ranks=range(0, sys.maxsize), command="b 25")),
        (
            "cast ranks(::123) b 25",
            Cast(ranks=range(0, sys.maxsize, 123), command="b 25"),
        ),
        ("cast ranks(123::) b 25", Cast(ranks=range(123, sys.maxsize), command="b 25")),
        ("cast ranks(:123:) b 25", Cast(ranks=range(0, 123), command="b 25")),
        ("cast ranks(:456:123) b 25", Cast(ranks=range(0, 456, 123), command="b 25")),
        (
            "cast ranks(456::123) b 25",
            Cast(ranks=range(456, sys.maxsize, 123), command="b 25"),
        ),
        ("cast ranks(123:456:) b 25", Cast(ranks=range(123, 456), command="b 25")),
        (
            "cast ranks(456:789:123) b 25",
            Cast(ranks=range(456, 789, 123), command="b 25"),
        ),
        ("cast ranks(dim1=123) up 2", Cast(ranks={"dim1": 123}, command="up 2")),
        (
            "cast ranks(dim1=123, dim2=(12,34,56), dim3=15::2) up 2",
            Cast(
                ranks={
                    "dim1": 123,
                    "dim2": [12, 34, 56],
                    "dim3": range(15, sys.maxsize, 2),
                },
                command="up 2",
            ),
        ),
    ],
)
async def test_debug_command_parser_valid_inputs(user_input, expected_output):
    assert DebugCommand.parse(user_input) == expected_output


@pytest.mark.parametrize(
    "invalid_input",
    [
        "",
        "attch 1",
        "attach",
        "cast rnks(123) b 25",
        "cast ranks() b 25",
        "cast ranks(1ab) b 25",
        "cast ranks(1,a,3) b 25",
        "cast ranks(a:2:4) b 25",
        "cast ranks(1,2,3",
        "cast ranks(1,2,3)) b 25",
        "cast ranks(1,) b 25",
        "cast ranks(1,2,) b 25",
        "cast ranks(,1,2) b 25",
        "cast ranks(1,,2) b 25",
        "cast ranks(:::) b 25",
        "cast ranks(:123::) b 25",
        "cast ranks(1:2:3,4) b 25",
        "cast ranks(dim1=) b 25",
        "cast ranks(dim1=123, dim2=) b 25",
        "cast ranks(dim1=123, dim2=(12,34,56) b 25",
        "cast ranks(dim1=123, dim2=(,12,34,56) b 25",
        "cast ranks(dim1=123, dim2=(12,,34,56) b 25",
        "cast ranks(dim1=123, dim2=(12,34,56), dim3=15::2 b 25",
        "cast ranks(dim1=123,) b 25",
    ],
)
async def test_debug_command_parser_invalid_inputs(invalid_input):
    assert DebugCommand.parse(invalid_input) is None
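For readers skimming the parser tests: the "cast" command takes a rank selector that may be a single rank, a comma-separated list, a Python-slice-style range, or per-dimension filters. The sketch below is a compact restatement of that grammar; every assertion mirrors a parametrized case from the tests above.

import sys

from monarch._src.actor.debugger import Attach, Cast, DebugCommand

assert DebugCommand.parse("a 100") == Attach(100)  # "a" aliases "attach"
assert DebugCommand.parse("cast ranks(123) b 25") == Cast(ranks=123, command="b 25")
assert DebugCommand.parse("cast ranks(12,34,56) b 25") == Cast(
    ranks=[12, 34, 56], command="b 25"
)
assert DebugCommand.parse("cast ranks(123:456) b 25") == Cast(
    ranks=range(123, 456), command="b 25"
)
assert DebugCommand.parse("cast ranks(dim1=123) up 2") == Cast(
    ranks={"dim1": 123}, command="up 2"
)
assert DebugCommand.parse("cast rnks(123) b 25") is None  # typos are rejected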
tests/test_env_before_cuda.py
ADDED
@@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import os
import sys
import unittest
from typing import Dict, List

import cloudpickle

import torch
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
from monarch._src.actor.allocator import LocalAllocator
from monarch._src.actor.proc_mesh import proc_mesh
from monarch.actor import Actor, endpoint, ProcMesh


class CudaInitTestActor(Actor):
    """Actor that initializes CUDA and checks environment variables"""

    def __init__(self) -> None:
        self.env_vars_before_init: Dict[str, str] = {}
        self.cuda_initialized: bool = False

    @endpoint
    async def init_cuda_and_check_env(self, env_var_names: List[str]) -> Dict[str, str]:
        """
        Check environment variables before initializing CUDA
        Returns the values of the environment variables
        """
        for var_name in env_var_names:
            self.env_vars_before_init[var_name] = os.environ.get(var_name, "NOT_SET")

        if torch.cuda.is_available():
            torch.cuda.init()
            self.cuda_initialized = True

        return self.env_vars_before_init

    @endpoint
    async def is_cuda_initialized(self) -> bool:
        """Return whether CUDA was initialized"""
        return self.cuda_initialized


class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
    """Test that the env vars are set up before cuda init"""

    @classmethod
    def setUpClass(cls) -> None:
        cloudpickle.register_pickle_by_value(sys.modules[CudaInitTestActor.__module__])

    @classmethod
    def tearDownClass(cls) -> None:
        cloudpickle.unregister_pickle_by_value(
            sys.modules[CudaInitTestActor.__module__]
        )

    async def test_lambda_sets_env_vars_before_cuda_init(self) -> None:
        """Test that environment variables are set by lambda before CUDA initialization"""
        cuda_env_vars: Dict[str, str] = {
            "CUDA_VISIBLE_DEVICES": "0",
            "CUDA_CACHE_PATH": "/tmp/cuda_cache_test",
            "CUDA_LAUNCH_BLOCKING": "1",
        }

        def setup_cuda_env() -> None:
            for name, value in cuda_env_vars.items():
                os.environ[name] = value

        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
        allocator = LocalAllocator()
        alloc = await allocator.allocate(spec)

        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_cuda_env)

        try:
            actor = await proc_mesh.spawn("cuda_init", CudaInitTestActor)

            env_vars = await actor.init_cuda_and_check_env.call_one(
                list(cuda_env_vars.keys())
            )

            await actor.is_cuda_initialized.call_one()

            for name, expected_value in cuda_env_vars.items():
                self.assertEqual(
                    env_vars.get(name),
                    expected_value,
                    f"Environment variable {name} was not set correctly before CUDA initialization",
                )

        finally:
            await proc_mesh.stop()

    async def test_proc_mesh_with_lambda_env(self) -> None:
        """Test that proc_mesh function works with lambda for env parameter"""
        cuda_env_vars: Dict[str, str] = {
            "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
            "CUDA_MODULE_LOADING": "LAZY",
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
        }

        def setup_cuda_env() -> None:
            for name, value in cuda_env_vars.items():
                os.environ[name] = value

        proc_mesh_instance = await proc_mesh(gpus=1, hosts=1, setup=setup_cuda_env)

        try:
            actor = await proc_mesh_instance.spawn("cuda_init", CudaInitTestActor)

            env_vars = await actor.init_cuda_and_check_env.call_one(
                list(cuda_env_vars.keys())
            )
            for name, expected_value in cuda_env_vars.items():
                self.assertEqual(
                    env_vars.get(name),
                    expected_value,
                    f"Environment variable {name} was not set correctly before CUDA initialization",
                )

        finally:
            await proc_mesh_instance.stop()

    async def test_proc_mesh_with_dictionary_env(self) -> None:
        """Test that proc_mesh function works with dictionary for env parameter"""
        cuda_env_vars: Dict[str, str] = {
            "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
            "CUDA_MODULE_LOADING": "LAZY",
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
        }

        proc_mesh_instance = await proc_mesh(gpus=1, hosts=1, env=cuda_env_vars)

        try:
            actor = await proc_mesh_instance.spawn("cuda_init", CudaInitTestActor)
            env_vars = await actor.init_cuda_and_check_env.call_one(
                list(cuda_env_vars.keys())
            )

            self.assertEqual(
                env_vars.get("CUDA_DEVICE_ORDER"),
                "PCI_BUS_ID",
            )
            self.assertEqual(
                env_vars.get("CUDA_MODULE_LOADING"),
                "LAZY",
            )
            self.assertEqual(
                env_vars.get("CUDA_DEVICE_MAX_CONNECTIONS"),
                "1",
            )

        finally:
            await proc_mesh_instance.stop()
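Taken together, these tests document two ways to prepare a worker's environment before anything in it touches CUDA: pass env= a dictionary, or pass setup= a callable that runs in each worker process at startup. A minimal sketch follows; the proc_mesh import, the env=/setup= parameters, and stop() are taken from the tests above, while the surrounding script is illustrative only.

# Minimal sketch, assuming only the API exercised by the tests above.
import asyncio
import os

from monarch._src.actor.proc_mesh import proc_mesh


async def main() -> None:
    # Dictionary form: static values known up front.
    mesh = await proc_mesh(gpus=1, hosts=1, env={"CUDA_LAUNCH_BLOCKING": "1"})
    await mesh.stop()

    # Callable form: runs inside each worker, so it can compute values
    # at worker startup, before CUDA is initialized there.
    def setup() -> None:
        os.environ["CUDA_CACHE_PATH"] = "/tmp/cuda_cache_test"

    mesh = await proc_mesh(gpus=1, hosts=1, setup=setup)
    await mesh.stop()


if __name__ == "__main__":
    asyncio.run(main())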