torchmonarch-nightly 2025.6.4 (cp310-cp310-manylinux2014_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
monarch_supervisor/logging.py ADDED
@@ -0,0 +1,103 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import logging
+ import os
+ import socket
+ import sys
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ def _handle_unhandled_exception(*args):
+     logger.error("Uncaught exception", exc_info=args)
+
+
+ _glog_level_to_abbr = {
+     "DEBUG": "V", # V is for VERBOSE in glog
+     "INFO": "I",
+     "WARNING": "W",
+     "ERROR": "E",
+     "CRITICAL": "C",
+ }
+
+
+ def fix_exception_lines(tb_lines):
+     formatted_lines = []
+     for line in tb_lines:
+         # Replace the standard file and line format with the custom format
+         if line.startswith(" File"):
+             # Extract the filename and line number
+             parts = line.split(",")
+             file_info = parts[0].strip()[6:-1]  # Remove ' File "' and '"'
+             line_info = parts[1].strip()[5:]  # Remove 'line '
+             new_line = f" File {file_info}:{line_info}"
+             if len(parts) > 2:
+                 new_line += ", " + ",".join(parts[2:]).strip()
+             formatted_lines.append(new_line)
+         else:
+             formatted_lines.append(line.strip())
+     return formatted_lines
+
+
+ class _Formatter(logging.Formatter):
+     def __init__(self, suffix):
+         self.suffix = suffix
+
+     def format(self, record):
+         message = record.getMessage()
+         asctime = self.formatTime(record, "%m%d %H:%M:%S")
+
+         lines = message.strip().split("\n")
+         if record.exc_info:
+             exc_info = fix_exception_lines(
+                 self.formatException(record.exc_info).split("\n")
+             )
+             lines.extend(exc_info)
+         if record.stack_info:
+             stack_info = self.formatStack(record.stack_info)
+             lines.extend(stack_info.strip().split("\n"))
+
+         shortlevel = _glog_level_to_abbr.get(record.levelname, record.levelname[0])
+
+         prefix = (
+             f"{shortlevel}{asctime}.{int(record.msecs*1000):06d} "
+             f"{record.filename}:"
+             f"{record.lineno}]{self.suffix}"
+         )
+         return "\n".join(f"{prefix} {line}" for line in lines)
+
+
+ def initialize_logging(process_name=None):
+     log_folder = os.environ.get("TORCH_MONARCH_LOG_FOLDER")
+     log_level = os.environ.get("TORCH_MONARCH_LOG_LEVEL", "INFO")
+     suffix = "" if process_name is None else f" {process_name}:"
+     handler = None
+     if log_folder is not None:
+         log_folder_path = Path(log_folder)
+         log_folder_path.mkdir(parents=True, exist_ok=True)
+         safe_process_name = (
+             process_name.replace("/", "_") if process_name else "logfile.log"
+         )
+         log_file_name = f"{safe_process_name}.log"
+         log_file_path = log_folder_path / log_file_name
+         handler = logging.FileHandler(log_file_path)
+     else:
+         handler = logging.StreamHandler()
+     handler.setFormatter(_Formatter(suffix))
+     handler.setLevel(log_level)
+     logging.root.setLevel(log_level)
+     logging.root.addHandler(handler)
+     sys.excepthook = _handle_unhandled_exception
+
+
+ def gethostname():
+     """Get the hostname of the machine."""
+     hostname = socket.gethostname()
+     hostname = hostname.replace(".facebook.com", "")
+     return hostname
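
Editor's note (not part of the diffed package): monarch_supervisor/logging.py above installs a glog-style formatter on the root logger. A minimal usage sketch, assuming the wheel is installed and TORCH_MONARCH_LOG_FOLDER is unset so output goes to stderr:

# Hypothetical example, not contained in the wheel: exercise the formatter above.
import logging

from monarch_supervisor.logging import initialize_logging

initialize_logging("demo")
logging.getLogger(__name__).warning("worker started")
# With the _Formatter defined above, each line should come out roughly as:
#   W0604 12:00:00.123456 example.py:7] demo: worker started
# (level abbreviation, MMDD HH:MM:SS.microseconds, file:line] suffix, message)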
monarch_supervisor/python_executable.py ADDED
@@ -0,0 +1,42 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import importlib.resources
+ import os
+ import sys
+
+ try:
+     from __manifest__ import fbmake  # noqa
+
+     IN_PAR = True
+ except ImportError:
+     IN_PAR = False
+
+ PYTHON_EXECUTABLE: str
+ if IN_PAR:
+     # The worker bootstrap binary will import this supervisor lib. When that
+     # happens don't try to search for the bootstrap binary again, just use the
+     # current executable.
+     import __main__ as main_module  # @manual
+
+     if hasattr(main_module, "__MONARCH_TENSOR_WORKER_ENV__"):
+         PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
+     else:
+         try:
+             with importlib.resources.path(
+                 "monarch_tensor_worker_env", "worker_env"
+             ) as path:
+                 if not path.exists():
+                     raise ImportError()
+                 PYTHON_EXECUTABLE = str(path)
+         except ImportError:
+             raise ImportError(
+                 "Monarch worker env not found, please define a custom 'monarch_tensor_worker_env' or "
+                 "add '//monarch/python/monarch_supervisor/worker:default_worker_env' "
+                 "to your binary dependencies in TARGETS"
+             )
+ else:
+     PYTHON_EXECUTABLE = sys.executable
tests/__init__.py ADDED
File without changes
tests/dispatch_bench.py ADDED
@@ -0,0 +1,124 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import logging
+ import sys
+
+ import torch
+ import torch.utils.benchmark as benchmark
+
+ # this function helps get a local device mesh for testing
+ from monarch._testing import mock_mesh
+ from monarch.builtins.log import set_logging_level_remote
+
+ from monarch.common._coalescing import coalescing
+ from monarch.common.remote import remote
+ from monarch.fetch import fetch_shard
+ from monarch.python_local_mesh import python_local_mesh
+ from monarch_supervisor.logging import initialize_logging
+ from tests.dispatch_bench_helper import run_loop, run_loop_local
+
+ NITER = 10000
+ DEFAULT_TENSOR_SIZE = (100, 100)
+
+ initialize_logging("dispatch_bench")
+
+
+ # user-defined remote functions
+ log = remote("monarch.worker._testing_function.log", propagate="inspect")
+
+
+ def local_run():
+     run_loop_local(NITER, DEFAULT_TENSOR_SIZE)
+
+
+ def dispatch_to_worker(device_mesh, n_iter, tensor_size):
+     with device_mesh.activate():
+         result = run_loop_local(n_iter, tensor_size)
+         local_result = fetch_shard(result, {"host": 0, "gpu": 0})
+         local_result = local_result.result()
+
+
+ def dispatch_to_worker_remote_function(device_mesh, n_iter, tensor_size):
+     with device_mesh.activate():
+         result = run_loop(n_iter, tensor_size)
+         local_result = fetch_shard(result, {"host": 0, "gpu": 0})
+         local_result = local_result.result()
+
+
+ def dispatch_to_worker_coalescing(device_mesh, n_iter, tensor_size):
+     with device_mesh.activate():
+         with coalescing():
+             result = run_loop_local(n_iter, tensor_size)
+         local_result = fetch_shard(result, {"host": 0, "gpu": 0})
+         local_result = local_result.result()
+
+
+ def main():
+     mocked = False
+     torch.set_default_device("cuda")
+     if mocked:
+         device_mesh = mock_mesh(hosts=1, gpus=1)
+     else:
+         device_mesh = python_local_mesh(hosts=1, gpus=1)
+
+     with device_mesh.activate():
+         torch.set_default_device("cuda")
+         set_logging_level_remote(logging.WARNING)
+
+     # bench 1: local compute only
+     t0 = benchmark.Timer(
+         stmt="run_loop_local(niter, tensor_size)",
+         setup="from __main__ import run_loop_local",
+         globals={"niter": NITER, "tensor_size": DEFAULT_TENSOR_SIZE},
+     )
+     local_only_results = t0.blocked_autorange(min_run_time=10)
+     print(local_only_results)
+
+     t1 = benchmark.Timer(
+         stmt="dispatch_to_worker(device_mesh, niter, tensor_size)",
+         setup="from __main__ import dispatch_to_worker",
+         globals={
+             "device_mesh": device_mesh,
+             "niter": NITER,
+             "tensor_size": DEFAULT_TENSOR_SIZE,
+         },
+     )
+     dispatch_to_worker_results = t1.blocked_autorange(min_run_time=10)
+     print(dispatch_to_worker_results)
+
+     t2 = benchmark.Timer(
+         stmt="dispatch_to_worker_remote_function(device_mesh, niter, tensor_size)",
+         setup="from __main__ import dispatch_to_worker_remote_function",
+         globals={
+             "device_mesh": device_mesh,
+             "niter": NITER,
+             "tensor_size": DEFAULT_TENSOR_SIZE,
+         },
+     )
+     dispatch_to_worker_remote_function_results = t2.blocked_autorange(min_run_time=10)
+     print(dispatch_to_worker_remote_function_results)
+
+     t3 = benchmark.Timer(
+         stmt="dispatch_to_worker_coalescing(device_mesh, niter, tensor_size)",
+         setup="from __main__ import dispatch_to_worker_coalescing",
+         globals={
+             "device_mesh": device_mesh,
+             "niter": NITER,
+             "tensor_size": DEFAULT_TENSOR_SIZE,
+         },
+     )
+     dispatch_to_worker_coalescing_results = t3.blocked_autorange(min_run_time=10)
+     print(dispatch_to_worker_coalescing_results)
+
+     device_mesh.exit()
+
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
tests/dispatch_bench_helper.py ADDED
@@ -0,0 +1,25 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import torch
+
+ from monarch.common.remote import remote
+
+
+ def run_loop_local(n_iters, tensor_shape=(2, 2)):
+     local = torch.zeros(*tensor_shape)
+     ones = torch.ones(*tensor_shape)
+     for _ in range(n_iters):
+         local = ones + local
+     return local
+
+
+ def _run_loop(*args, **kwargs):
+     return torch.ones(args[1])
+
+
+ run_loop = remote("tests.dispatch_bench_helper.run_loop_local", propagate=_run_loop)
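
Editor's note (not part of the diffed package): run_loop above pairs the worker-side run_loop_local (referenced by its dotted path) with _run_loop, a propagation function that appears to only fabricate a correctly shaped result locally so the controller can keep dispatching without waiting on the workers. tests/dispatch_bench.py drives it under an activated mesh, roughly as in this hedged sketch (mesh setup assumed to work in your environment):

# Hypothetical driver mirroring the call pattern in tests/dispatch_bench.py.
from monarch.fetch import fetch_shard
from monarch.python_local_mesh import python_local_mesh
from tests.dispatch_bench_helper import run_loop

device_mesh = python_local_mesh(hosts=1, gpus=1)
with device_mesh.activate():
    result = run_loop(10, (100, 100))  # executed on the workers
# Materialize one shard of the distributed result on the controller.
local = fetch_shard(result, {"host": 0, "gpu": 0}).result()
device_mesh.exit()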
tests/error_test_binary.py ADDED
@@ -0,0 +1,139 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import ctypes
+ import sys
+
+ from monarch._rust_bindings.monarch_extension.panic import panicking_function
+
+ from monarch.actor_mesh import Actor, endpoint
+ from monarch.proc_mesh import proc_mesh
+
+
+ class ErrorActor(Actor):
+     """An actor that has endpoints cause segfaults."""
+
+     @endpoint
+     async def cause_segfault(self) -> None:
+         """Endpoint that causes a segmentation fault."""
+         # Create a C function pointer to an invalid memory address
+         # This will reliably cause a segmentation fault when called
+         function_type = ctypes.CFUNCTYPE(None)
+         # Use a non-zero but invalid address to avoid ctypes null pointer checks
+         invalid_address = 0xDEADBEEF
+         invalid_function = function_type(invalid_address)
+         # Calling this function will cause a segfault
+         invalid_function()
+
+     @endpoint
+     async def cause_panic(self) -> None:
+         """Endpoint that calls a Rust function that panics."""
+         panicking_function()
+
+
+ class ErrorActorSync(Actor):
+     """An actor that has endpoints cause segfaults."""
+
+     @endpoint  # pyre-ignore
+     def cause_segfault(self) -> None:
+         """Endpoint that causes a segmentation fault."""
+         # Create a C function pointer to an invalid memory address
+         # This will reliably cause a segmentation fault when called
+         function_type = ctypes.CFUNCTYPE(None)
+         # Use a non-zero but invalid address to avoid ctypes null pointer checks
+         invalid_address = 0xDEADBEEF
+         invalid_function = function_type(invalid_address)
+         # Calling this function will cause a segfault
+         invalid_function()
+
+     @endpoint  # pyre-ignore
+     def cause_panic(self) -> None:
+         """Endpoint that calls a Rust function that panics."""
+         panicking_function()
+
+
+ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
+     proc = proc_mesh(gpus=num_procs).get()
+     if sync_endpoint:
+         actor_class = ErrorActorSync
+     else:
+         actor_class = ErrorActor
+     error_actor = proc.spawn("error_actor", actor_class).get()
+
+     # This output is checked in the test to make sure that the process actually got here
+     print("I actually ran")
+     sys.stdout.flush()
+
+     if endpoint_name == "cause_segfault":
+         endpoint = error_actor.cause_segfault
+     elif endpoint_name == "cause_panic":
+         endpoint = error_actor.cause_panic
+     else:
+         raise ValueError(f"Unknown endpoint name: {endpoint_name}")
+
+     # Exercise both call() and call_one() in our tests, to check that error
+     # aggregation behavior is consistent.
+     if num_procs == 1:
+         endpoint.call_one().get()
+     else:
+         endpoint.call().get()
+
+
+ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
+     import asyncio
+
+     if sync_endpoint:
+         actor_class = ErrorActorSync
+     else:
+         actor_class = ErrorActor
+
+     async def run_test():
+         proc = await proc_mesh(gpus=num_procs)
+         error_actor = await proc.spawn("error_actor", actor_class)
+
+         # This output is checked in the test to make sure that the process actually got here
+         print("I actually ran")
+         sys.stdout.flush()
+
+         if endpoint_name == "cause_segfault":
+             endpoint = error_actor.cause_segfault
+         elif endpoint_name == "cause_panic":
+             endpoint = error_actor.cause_panic
+         else:
+             raise ValueError(f"Unknown endpoint name: {endpoint_name}")
+
+         # Exercise both call() and call_one() in our tests, to check that error
+         # aggregation behavior is consistent.
+         if num_procs == 1:
+             await endpoint.call_one()
+         else:
+             await endpoint.call()
+
+     asyncio.run(run_test())
+
+
+ def main():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--num-procs", type=int)
+     parser.add_argument("--sync-test-impl", type=bool)
+     parser.add_argument("--sync-endpoint", type=bool)
+     parser.add_argument("--endpoint-name", type=str)
+     args = parser.parse_args()
+
+     print(
+         f"Running segfault test: {args.num_procs=} {args.sync_test_impl=} {args.sync_endpoint=}, {args.endpoint_name=}"
+     )
+
+     if args.sync_test_impl:
+         _run_error_test_sync(args.num_procs, args.sync_endpoint, args.endpoint_name)
+     else:
+         _run_error_test(args.num_procs, args.sync_endpoint, args.endpoint_name)
+
+
+ if __name__ == "__main__":
+     main()
tests/simulator/__init__.py ADDED
File without changes
tests/simulator/test_profiling.py ADDED
@@ -0,0 +1,136 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import unittest
+
+ import pytest
+
+ import torch
+
+ from monarch.common import messages
+ from monarch.simulator.profiling import RuntimeEstimator, RuntimeProfiler, TimingType
+
+
+ # pyre-ignore-all-errors[6]
+ # pyre-ignore-all-errors[16]
+ class TestRuntimeEstimator(unittest.TestCase):
+     def test_user_manual_setting(self):
+         runtime = RuntimeEstimator()
+
+         input_tensor = torch.rand(10, 10)
+         input_tensor.ref = 1
+         input_tensor._fake = None
+         output_tensor = torch.rand(10, 10)
+         output_tensor.ref = 2
+         output_tensor._fake = None
+
+         send_tensor = messages.SendTensor(
+             result=output_tensor,
+             from_ranks=[1],
+             to_ranks=[2],
+             tensor=input_tensor,
+             factory=None,
+             from_stream=None,
+             to_stream=None,
+         )
+         reduce = messages.Reduce(
+             result=output_tensor,
+             local_tensor=input_tensor,
+             factory=None,
+             source_mesh=None,
+             stream=None,
+             dims=None,
+             reduction=None,
+             scatter=False,
+             inplace=False,
+             out=None,
+         )
+         call_function = messages.CallFunction(
+             ident=1,
+             result=None,
+             mutates=None,
+             function=None,
+             args=None,
+             kwargs=None,
+             stream=None,
+             device_mesh=None,
+             remote_process_groups=None,
+         )
+
+         self.assertEqual(runtime.get_runtime(send_tensor), 100_000)
+         self.assertEqual(runtime.get_runtime(reduce), 100_000)
+         self.assertEqual(runtime.get_runtime(call_function), 10_000)
+         self.assertEqual(runtime.get_runtime("kernel_launch"), 500)
+         self.assertEqual(runtime.get_runtime("wait_event"), 500)
+
+         runtime.set_custom_timing(
+             {
+                 TimingType.SEND_TENSOR: 1_000,
+                 TimingType.REDUCE: 2_000,
+                 TimingType.CALL_FUNCTION: 3_000,
+                 TimingType.KERNEL_LAUNCH: 4_000,
+                 TimingType.WAIT_EVENT: 5_000,
+             }
+         )
+         self.assertEqual(runtime.get_runtime(send_tensor), 1_000)
+         self.assertEqual(runtime.get_runtime(reduce), 2_000)
+         self.assertEqual(runtime.get_runtime(call_function), 3_000)
+         self.assertEqual(runtime.get_runtime("kernel_launch"), 4_000)
+         self.assertEqual(runtime.get_runtime("wait_event"), 5_000)
+
+         runtime.set_custom_timing(
+             {
+                 TimingType.SEND_TENSOR: lambda msg: 4_000,
+                 TimingType.REDUCE: lambda msg: 5_000,
+                 TimingType.CALL_FUNCTION: lambda msg: 6_000,
+                 TimingType.KERNEL_LAUNCH: lambda: 8_000,
+                 TimingType.WAIT_EVENT: lambda: 9_000,
+             }
+         )
+         self.assertEqual(runtime.get_runtime(send_tensor), 4_000)
+         self.assertEqual(runtime.get_runtime(reduce), 5_000)
+         self.assertEqual(runtime.get_runtime(call_function), 6_000)
+         self.assertEqual(runtime.get_runtime("kernel_launch"), 8_000)
+         self.assertEqual(runtime.get_runtime("wait_event"), 9_000)
+
+     @pytest.mark.oss_skip
+     def test_runtime_profiler(self) -> None:
+         m1 = torch.rand(1000, 2000).cuda()
+         m2 = torch.rand(2000, 4000).cuda()
+         m1.ref = 1
+         m2.ref = 2
+         msg = messages.CallFunction(
+             ident=1,
+             result=None,
+             mutates=None,
+             function=torch.ops.aten.mm.default,
+             args=(m1, m2),
+             kwargs=None,
+             stream=None,
+             device_mesh=None,
+             remote_process_groups=None,
+         )
+         profiler = RuntimeProfiler()
+
+         ret = profiler.profile_cmd(msg, ranks=[0])[0]
+         self.assertEqual(ret[0].factory.size, (1000, 4000))
+         # Should be at least 0.1 ms
+         self.assertTrue(ret[1] > 100)
+         # Should be at most 100 ms
+         self.assertTrue(ret[1] < 100_000)
+
+         # Change the cached profiling result to verify if cached mechanism works
+         key = next(iter(profiler.cached.keys()))
+         profiler.cached[key][0] = (profiler.cached[key][0][0], 987_654_321)
+
+         ret = profiler.profile_cmd(msg, ranks=[0])[0]
+         self.assertEqual(ret[0].factory.size, (1000, 4000))
+         self.assertEqual(ret[1], 987_654_321)
+
+
+ if __name__ == "__main__":
+     unittest.main()