torchmonarch-nightly 2025.6.27__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/common/context_manager.py
ADDED
@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
from functools import wraps


class _ContextManager:
    def __init__(self, generator):
        self.generator = generator
        self.generator.send(None)

    def __enter__(self):
        return

    def __exit__(self, *args):
        try:
            self.generator.send(None)
        except StopIteration:
            pass
        else:
            raise RuntimeError("context manager generator did not exit")


def activate_first_context_manager(func):
    """
    Similar to contextlib.contextmanager but it
    starts the context when the function is called rather
    than at the start of the with statement. Useful for things where
    you want to optionally activate the context without a guard.
    """

    @wraps(func)
    def helper(*args, **kwargs):
        return _ContextManager(func(*args, **kwargs))

    return helper
monarch/common/controller_api.py
ADDED
@@ -0,0 +1,104 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
from typing import Any, List, NamedTuple, Optional, Protocol, Sequence, Union

from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monarch/monarch_extension:monarch_extension
    DebuggerMessage,
    LogLevel,
    WorldState,
)

from monarch.common.invocation import DeviceException, RemoteException, Seq
from monarch.common.reference import Ref
from monarch.common.shape import NDSlice
from monarch.common.tensor import Tensor


class LogMessage(NamedTuple):
    level: LogLevel
    message: str


class MessageResult(NamedTuple):
    """
    Message result given a seq id of an invocation.
    """

    seq: Seq
    result: Any
    error: Optional[RemoteException | DeviceException] = None


class TController(Protocol):
    """
    Controller APIs
    """

    # =======================================================
    # === APIs for the client to call into the controller ===
    # =======================================================

    def send(
        self,
        ranks: Union[NDSlice, List[NDSlice]],
        msg: NamedTuple,
    ) -> None:
        """
        Send a message to a set of ranks.
        """
        ...

    def drop_refs(self, refs: Sequence[Ref]) -> None:
        """
        Mark references as never being used again
        """
        ...

    # TODO: there are a few things to do to clean up the API:
    # 2. no need to depend on Tensors, a Referenceable; a Ref is enough.
    # 3. support mutates as another input parameter.
    def node(
        self, seq: Seq, defs: Sequence["Tensor"], uses: Sequence["Tensor"]
    ) -> None:
        """
        Create an invocation node given a sequence id. The node provides what tensors it defines,
        what tensors it uses, and what tensors it mutates.
        """
        ...

    # ==============================================================
    # == APIs for the client to read response from the controller ==
    # ==============================================================

    # TODO: remove timeout parameter; instead, return a future that can wait on a timeout
    def next_message(
        self, timeout: Optional[float]
    ) -> Optional[MessageResult | LogMessage]:
        """
        Read a message given a timeout in seconds. Returns a message output given the seq of an invocation.
        The output could be the returned value or an exception.
        If the returned message is None, it means there is no message to read within the given timeout.
        If timeout is None, it means no timeout (infinite).
        """
        ...

    def stop_mesh(self) -> None:
        """Stop the system."""
        ...

    def drain_and_stop(self) -> List[MessageResult | LogMessage | DebuggerMessage]:
        """Drain all the messages in the controller upon shutdown."""
        ...

    def worker_world_state(self) -> WorldState:
        """
        Retrieve the worker world state.

        :return: The worker WorldState.
        """
        ...
monarch/common/device_mesh.py
ADDED
@@ -0,0 +1,417 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import logging

import warnings
from contextlib import AbstractContextManager, contextmanager
from dataclasses import dataclass
from enum import Enum
from logging import Logger
from typing import (
    Any,
    Callable,
    Dict,
    List,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    TYPE_CHECKING,
    Union,
)

import monarch.common.messages as messages
import torch
from monarch.common.shape import MeshTrait

from torch.utils._python_dispatch import TorchDispatchMode
from torch.utils._pytree import tree_map

from ._tensor_to_table import tensor_to_table
from .context_manager import activate_first_context_manager
from .messages import Dims
from .reference import Referenceable
from .shape import NDSlice, Shape
from .stream import Stream
from .tensor import MeshSliceTensor, Tensor

if TYPE_CHECKING:
    from monarch.common.client import Client

logger: Logger = logging.getLogger(__name__)


class RemoteProcessGroup(Referenceable):
    """
    Client's view of a process group.
    """

    def __init__(self, dims, device_mesh):
        logger.info(f"creating process group for {dims}")
        self.dims = dims
        self.device_mesh = device_mesh
        self.ref = self.device_mesh.client.new_ref()
        self._create_remotely()
        # A set of streams for which we've sent the split-comm message.
        self._split_comm_done = set()

    def _create_remotely(self):
        msg = messages.CreateRemoteProcessGroup(self, self.device_mesh, self.dims)
        self.device_mesh._send(msg)

    def ensure_split_comm_remotely(self, stream):
        """
        If we haven't already, send a message to the worker to split off a
        communicator for this PG on the given stream.
        """

        # Currently, the worker will error if we try to do the split-comm more
        # than once, so check for that here to allow this function to be called
        # lazily.
        if stream in self._split_comm_done:
            return
        self._split_comm_done.add(stream)

        msg = messages.SplitCommForProcessGroup(
            remote_process_group=self,
            stream=stream,
        )
        self.device_mesh.client.send_nocoalesce(
            self.device_mesh.client.all_ranks,
            msg,
        )

    def delete_ref(self, ref: int):
        if not self.device_mesh.client.has_shutdown:
            self.device_mesh.client.handle_deletes(self.device_mesh.processes, [ref])

    def drop(self):
        if self.ref is None:
            return
        self._drop_ref()

    def size(self):
        return self.device_mesh.size(self.dims)

    def _drop_ref(self):
        if self.ref is None:
            return
        self.delete_ref(self.ref)
        self.ref = None

    @property
    def dropped(self):
        return self.ref is None


class ActivateGuard:
    def __init__(self, iter):
        self.iter = iter
        next(iter)

    def __enter__(self):
        return

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            next(self.iter)
        except StopIteration:
            pass


class DeviceMeshStatus(Enum):
    """
    Enum representing the status of a device mesh.
    Attributes:
        LIVE (str): The mesh has at least as many processes as the specified world size and all of them are healthy.
        UNHEALTHY (str): Either the mesh does not have enough processes or some of the processes are unhealthy.
        AWAITING_CREATION (str): The mesh is still being created by the scheduler.
    """

    LIVE = "Live"
    UNHEALTHY = "Unhealthy"
    AWAITING_CREATION = "Awaiting Creation"


@dataclass
class DeviceMeshInfo:
    """
    Data class representing information about a device mesh.

    Attributes:
        mesh_labels (Dict[str, str]): Maps mesh labels to values.
        devices_labels (List[Dict[str, str]]): Maps device labels to values.
    """

    mesh_labels: Dict[str, str]
    devices_labels: List[Dict[str, str]]


class DeviceMesh(Referenceable, MeshTrait):
    def __init__(
        self,
        client: "Client",
        processes: "NDSlice",
        names: Dims,
        mesh_name: str = "default",
    ):
        assert isinstance(processes, NDSlice)
        self.client = client
        assert processes.ndim == len(names)
        self.names = names
        self.mesh_name = mesh_name
        # processes are a list of processes that participate in this device mesh, encoded as an NDSlice
        self.processes = processes
        self.exit = lambda: None
        self.ref = None
        self._active_mesh_context = None

    def define_remotely(self):
        if self.ref is None:
            self.ref = self.client.new_ref()
            msg = messages.CreateDeviceMesh(self, self.names, self.processes)
            self.client.send(self.processes, msg)

    def process_group(self, dims: str | Dims) -> RemoteProcessGroup:
        self.define_remotely()
        if isinstance(dims, str):
            dims = (dims,)
        return RemoteProcessGroup(dims, self)

    def to_tensor(self):
        with no_mesh.activate():
            vals = torch.tensor(list(self.processes), device="cpu", dtype=torch.int)
            return vals.view(self.processes.sizes)

    def to_table(self):
        with no_mesh.activate():
            tensor = self.to_tensor()
            names = list(self.names)
            labels = [list(str(i) for i in range(i)) for i in tensor.shape]
            gpus_per_host = self.client.gpu_per_host

            def format_data(x):
                return f"{x//gpus_per_host}.gpu[{x%gpus_per_host}]"

            return tensor_to_table(
                tensor, format_data=format_data, axis_names=names, axis_labels=labels
            )

    def __repr__(self):
        return f"<DeviceMesh(names({self.names}), processes({list(self.processes)})) at {hex(id(self))}>"

    def delete_ref(self, ref: int):
        if not self.client.has_shutdown:
            self.client.handle_deletes(self.processes, [ref])

    def _send(self, cmd: NamedTuple):
        self.client.flush_deletes()
        self.client.send(self.processes, cmd)

    def stack(self, **kwargs):
        raise NotImplementedError()

    @property
    def _ndslice(self) -> NDSlice:
        return self.processes

    @property
    def _labels(self) -> Tuple[str, ...]:
        return self.names

    def _new_with_shape(self, shape: Shape) -> "DeviceMesh":
        mesh = DeviceMesh(self.client, shape.ndslice, tuple(shape.labels))
        mesh.exit = self.exit
        return mesh

    def __call__(self, **kwargs) -> "DeviceMesh":
        """
        device_mesh(batch=3) or device_mesh(batch=slice(3, None))
        """
        warnings.warn(
            "The use of this method is deprecated. Please use mesh.slice instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.slice(**kwargs)

    def rotate(self, **kwargs: Dict[str, int]):
        raise NotImplementedError()

    def rank(self, dims: Union[str, Sequence[str]]) -> torch.Tensor:
        self.define_remotely()
        if isinstance(dims, str):
            if dims not in self.names:
                raise KeyError(f"{self} does not have dimension {repr(dims)}")
            return _remote(
                _rank,
                propagate=lambda _self, _dims: torch.full((), 0, dtype=torch.long),
            )(self, dims)

        combined_rank: Any = 0
        for dim in dims:
            combined_rank *= self.size(dim)
            combined_rank += self.rank(dim)
        return combined_rank

    @property
    def ranks(self) -> dict[str, torch.Tensor]:
        return {dim: self.rank(dim) for dim in self.names}

    def process_idx(self):
        self.define_remotely()
        return _remote(
            "monarch.worker.worker._process_idx",
            propagate=lambda _self: torch.full((), 0, dtype=torch.long),
        )(self)

    def _process(self, coordinates: Optional[Dict[str, int]]) -> NDSlice:
        if coordinates is None:
            return NDSlice(offset=self.processes.offset, sizes=[1], strides=[1])
        if len(coordinates) > len(self.names):
            extra = set(coordinates.keys()) - set(self.names)
            raise KeyError(f"{list(extra)}")
        for name in self.names:
            if name not in coordinates:
                raise ValueError(
                    f"Missing key '{name}' in shard map. Need all of {self.names}"
                )
        flat = [coordinates[name] for name in self.names]
        return NDSlice(offset=self.processes.nditem(flat), sizes=[1], strides=[1])

    def activate(self) -> AbstractContextManager:
        self._active_mesh_context = _active_mesh(self)
        return self._active_mesh_context

    def deactivate(self):
        if self._active_mesh_context is not None:
            self._active_mesh_context.__exit__(None, None, None)
            self._active_mesh_context = None

    def get_info(self) -> DeviceMeshInfo:
        """
        Retrieves metadata about the device mesh and its constituent devices.

        Returns:
            DeviceMeshInfo: Contains mesh-level labels and per-device labels.
        """
        mesh_state = self.client.mesh_state()

        return DeviceMeshInfo(
            mesh_labels=mesh_state.labels,
            devices_labels=[proc.labels for proc in mesh_state.procs.values()],
        )


_active: Optional[DeviceMesh] = None
_dispatch_enabled = False


def get_active_mesh():
    if _active is None:
        raise ValueError("no device mesh is active")
    return _active


class _ActiveMesh(TorchDispatchMode):
    ignore = ["profiler._record_function_exit._RecordFunction"]
    allowed_local_accessors = ["aten._local_scalar_dense.default"]

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        if _active is None:
            return func(*args, **kwargs)
        fnstr = str(func)
        if fnstr in self.ignore:
            return func(*args, **kwargs)
        if fnstr in self.allowed_local_accessors and not isinstance(args[0], Tensor):
            return func(*args, **kwargs)
        return _remote(func, propagate=func)(*args, **kwargs)


def _rank(mesh, dim):
    return torch.full((), mesh.dims[dim].rank, dtype=torch.long)


@contextmanager
def _dispatch():
    global _dispatch_enabled
    if _dispatch_enabled:
        yield
    else:
        _dispatch_enabled = True
        try:
            with _ActiveMesh():
                yield
        finally:
            _dispatch_enabled = False


_on_change: List[Callable] = []


@activate_first_context_manager
def _active_mesh(mesh: Optional[DeviceMesh]):
    global _active
    for on_change in _on_change:
        on_change(_active, mesh)
    _active, old = mesh, _active
    try:
        with _dispatch():
            yield
    finally:
        for on_change in _on_change:
            on_change(_active, old)
        _active = old


class _NoMesh:
    def activate(self):
        return _active_mesh(None)


no_mesh = _NoMesh()


def _remote(*args, **kwargs):
    # device_mesh <-> tensor <-> remote are mutually recursive
    # we break the dependency to allow for separate files by
    # having device_mesh and tensor locally import the `remote`
    # entrypoint
    from monarch.common.remote import remote

    return remote(*args, **kwargs)


def to_mesh(
    tensors: Any,
    mesh: "DeviceMesh",
    stream: Optional[Stream] = None,
) -> Any:
    """
    Move all tensors in tensors to the given mesh.
    """

    def _to_mesh(tensor: Union["Tensor", "MeshSliceTensor"]) -> "Tensor":
        return tensor.to_mesh(mesh, stream)

    return tree_map(_to_mesh, tensors)


def slice_mesh(
    tensors: Any,
    **kwargs: Union[int, slice],
) -> Any:
    """
    Performs the slice_mesh operation for each tensor in tensors.
    """

    def _slice_mesh(tensor: "Tensor") -> "MeshSliceTensor":
        return tensor.slice_mesh(**kwargs)

    return tree_map(_slice_mesh, tensors)
monarch/common/fake.py
ADDED
@@ -0,0 +1,55 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
from concurrent.futures import ThreadPoolExecutor
from functools import cache

from torch._subclasses.fake_tensor import FakeTensorMode


@cache
def _fake_mode_worker():
    return ThreadPoolExecutor(max_workers=1)


@cache
def _fake_mode():
    return FakeTensorMode()


def fake_call(fn, *args, **kwargs):
    """Execute work on a ThreadPool worker

    First call (ThreadPoolExecutor init) will take the GIL and may block for a long time!
    TODO: this will be replaced with something more performant
    """
    global _fake_mode_worker, fake_mode

    # # Calls FakeTensorMode while re-enabling version counter tracking
    # # todo(chilli): I'm not totally sure why I need to disable python dispatch
    # # key. Perhaps there's some unwrapping that should have happened further up.
    # include_to_set = torch._C._dispatch_tls_local_include_set()
    # exclude_to_set = (
    #     torch._C._dispatch_tls_local_exclude_set()
    #     | torch._C.DispatchKeySet(torch._C.DispatchKey.Python)
    # ) - torch._C.DispatchKeySet(torch._C.DispatchKey.ADInplaceOrView)

    # def work():
    #     with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set):
    #         with fake_mode:
    #             return fn(*args, **kwargs)

    # return work()

    def work():
        # fake mode must be initialized in the worker thread
        # otherwise a monarch dispatch mode may be active, causing
        # FakeTensorMode to initialize wrong.
        with _fake_mode():
            return fn(*args, **kwargs)

    return _fake_mode_worker().submit(work).result()