torchmonarch-nightly 2025.8.2__cp311-cp311-manylinux2014_x86_64.whl → 2025.9.4__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +504 -218
- monarch/_src/actor/allocator.py +75 -6
- monarch/_src/actor/bootstrap_main.py +7 -4
- monarch/_src/actor/code_sync/__init__.py +2 -0
- monarch/_src/actor/debugger/__init__.py +7 -0
- monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
- monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
- monarch/_src/actor/endpoint.py +27 -45
- monarch/_src/actor/future.py +86 -24
- monarch/_src/actor/host_mesh.py +125 -0
- monarch/_src/actor/logging.py +94 -0
- monarch/_src/actor/pickle.py +25 -0
- monarch/_src/actor/proc_mesh.py +423 -156
- monarch/_src/actor/python_extension_methods.py +90 -0
- monarch/_src/actor/shape.py +8 -1
- monarch/_src/actor/source_loader.py +45 -0
- monarch/_src/actor/telemetry/__init__.py +172 -0
- monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
- monarch/_src/debug_cli/__init__.py +7 -0
- monarch/_src/debug_cli/debug_cli.py +43 -0
- monarch/_src/tensor_engine/rdma.py +64 -9
- monarch/_testing.py +1 -3
- monarch/actor/__init__.py +24 -4
- monarch/common/_C.so +0 -0
- monarch/common/device_mesh.py +14 -0
- monarch/common/future.py +10 -0
- monarch/common/remote.py +14 -25
- monarch/common/tensor.py +12 -0
- monarch/debug_cli/__init__.py +7 -0
- monarch/debug_cli/__main__.py +12 -0
- monarch/fetch.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +4 -2
- monarch/mesh_controller.py +34 -14
- monarch/monarch_controller +0 -0
- monarch/tools/colors.py +25 -0
- monarch/tools/commands.py +42 -7
- monarch/tools/components/hyperactor.py +6 -4
- monarch/tools/config/__init__.py +35 -12
- monarch/tools/config/defaults.py +15 -5
- monarch/tools/config/environment.py +45 -0
- monarch/tools/config/workspace.py +165 -0
- monarch/tools/mesh_spec.py +3 -3
- monarch/utils/__init__.py +9 -0
- monarch/utils/utils.py +78 -0
- tests/error_test_binary.py +5 -3
- tests/python_actor_test_binary.py +52 -0
- tests/test_actor_error.py +142 -14
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +59 -72
- tests/test_debugger.py +639 -45
- tests/test_env_before_cuda.py +4 -4
- tests/test_mesh_trait.py +38 -0
- tests/test_python_actors.py +965 -75
- tests/test_rdma.py +7 -6
- tests/test_tensor_engine.py +6 -6
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/METADATA +82 -4
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/RECORD +63 -47
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/top_level.txt +0 -0
monarch/common/device_mesh.py
CHANGED
@@ -154,6 +154,20 @@ class DeviceMeshInfo:
 
 
 class DeviceMesh(Referenceable, MeshTrait):
+    """A mesh of devices for distributed tensor operations.
+
+    DeviceMesh represents a collection of devices arranged in a
+    multidimensional grid for parallel computation. It manages
+    communication between devices and enables distributed execution
+    of operations across the mesh.
+
+    Args:
+        client (Client): The client connection to the mesh infrastructure
+        processes (NDSlice): Multi-dimensional slice representing the process layout
+        names (Dims): Names for each dimension of the mesh
+        mesh_name (str, optional): Name identifier for the mesh. Default: "default"
+    """
+
     def __init__(
         self,
         client: "Client",
monarch/common/future.py
CHANGED
@@ -68,6 +68,16 @@ T = TypeVar("T")
 
 
 class Future(Generic[T]):
+    """A future object representing the result of an asynchronous computation.
+
+    Future provides a way to access the result of a computation that may not
+    have completed yet. It allows for non-blocking execution and provides
+    methods to wait for completion and retrieve results.
+
+    Args:
+        client (Client): The client connection for handling the future
+    """
+
     def __init__(self, client: "Client"):
         self._client = client
         self._status = "incomplete"
monarch/common/remote.py
CHANGED
@@ -28,10 +28,10 @@ from typing import (
 import monarch.common.messages as messages
 
 import torch
-from monarch._rust_bindings.monarch_hyperactor.
-from monarch.
-from monarch._src.actor.
-from monarch._src.actor.
+from monarch._rust_bindings.monarch_hyperactor.shape import Extent, Shape
+from monarch._src.actor.actor_mesh import Port
+from monarch._src.actor.endpoint import Selection
+from monarch._src.actor.future import Future
 
 from monarch.common import _coalescing, device_mesh, stream
 from monarch.common.future import Future as OldFuture
@@ -135,20 +135,6 @@ class Remote(Generic[P, R], Endpoint[P, R]):
         client._request_status()
         return Extent(ambient_mesh._labels, ambient_mesh._ndslice.sizes)
 
-    def _port(self, once: bool = False) -> "PortTuple[R]":
-        ambient_mesh = device_mesh._active
-        if ambient_mesh is None:
-            raise ValueError(
-                "FIXME - cannot create a port without an active proc_mesh, because there is not way to create a port without a mailbox"
-            )
-        mesh_controller = getattr(ambient_mesh.client, "_mesh_controller", None)
-        if mesh_controller is None:
-            raise ValueError(
-                "Cannot create raw port objects with an old-style tensor engine controller."
-            )
-        mailbox: Mailbox = mesh_controller._mailbox
-        return PortTuple.create(mailbox, once)
-
     @property
     def _resolvable(self):
         return resolvable_function(self._remote_impl)
@@ -212,7 +198,7 @@ remote_identity = Remote(None, lambda x: x)
 
 def call_on_shard_and_fetch(
     remote: Endpoint[P, R], *args, shard: Dict[str, int] | None = None, **kwargs
-) ->
+) -> Future[R]:
     # We have to flatten the tensors twice: first to discover
     # which mesh we are working on to shard it, and then again when doing the
     # dtensor_check in send. This complexity is a consequence of doing
@@ -224,17 +210,20 @@ def call_on_shard_and_fetch(
     checker.check_mesh_stream_local(device_mesh._active, stream._active)
 
     if not hasattr(checker.mesh.client, "_mesh_controller"):
-        return
-
-
-
-
+        return cast(
+            "Future[R]",
+            _old_call_on_shard_and_fetch(
+                cast("Remote[P, R]", remote),
+                *args,
+                shard=shard,
+                **kwargs,
+            ),
         )
 
     selected_slice = checker.mesh._process(shard)
     shard_mesh = checker.mesh._new_with_shape(Shape(["_"], selected_slice))
     with shard_mesh.activate():
-        return
+        return remote.call_one(*args, **kwargs)
 
 
 def _old_call_on_shard_and_fetch(
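The retyped call_on_shard_and_fetch keeps a legacy escape hatch: when the client has no _mesh_controller attribute it delegates to the old untyped helper and casts the result back to Future[R]. Below is a minimal, self-contained sketch of that dispatch pattern; the names call_and_fetch, _legacy_fetch, and the toy Future class are illustrative placeholders, not monarch APIs.

from typing import Callable, Generic, TypeVar, cast

R = TypeVar("R")


class Future(Generic[R]):
    """Toy stand-in for a future that already holds its value."""

    def __init__(self, value: R) -> None:
        self._value = value

    def get(self) -> R:
        return self._value


def _legacy_fetch(fn: Callable[..., object], *args: object) -> object:
    # stand-in for the old, untyped code path (_old_call_on_shard_and_fetch)
    return Future(fn(*args))


def call_and_fetch(client: object, fn: Callable[..., R], *args: object) -> "Future[R]":
    if not hasattr(client, "_mesh_controller"):
        # legacy controller: the helper returns an untyped object, so cast it
        # back to the annotated return type, mirroring the hunk above
        return cast("Future[R]", _legacy_fetch(fn, *args))
    # new-style controller: build the typed future directly
    return Future(fn(*args))


print(call_and_fetch(object(), lambda x: x * 2, 21).get())  # prints 42

typing.cast is purely a static-typing assertion; at runtime it returns its argument unchanged, which is why the untyped legacy path can be reused without modification.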
monarch/common/tensor.py
CHANGED
@@ -74,6 +74,18 @@ class DropLocation(NamedTuple):
 
 
 class Tensor(Referenceable, BaseTensor):
+    """A distributed tensor for distributed computation across device meshes.
+
+    Tensor represents a distributed tensor that spans across multiple devices
+    in a device mesh. It provides the same interface as PyTorch tensors but
+    enables distributed operations and communication patterns.
+
+    Args:
+        fake (torch.Tensor): A fake tensor representing the shape and type
+        mesh (DeviceMesh): The device mesh this tensor is distributed across
+        stream (Stream): The computation stream for this tensor
+    """
+
     # pyre-fixme[13]: Attribute `stream` is never initialized.
     stream: Stream
     # pyre-fixme[13]: Attribute `mesh` is never initialized.
monarch/debug_cli/__main__.py
ADDED
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+from monarch._src.debug_cli import debug_cli
+
+
+if __name__ == "__main__":
+    debug_cli.run()
monarch/fetch.py
CHANGED
@@ -11,9 +11,9 @@ This is a utility file for fetching a shard of a tensor from remote.
 
 
 from typing import cast, TypeVar
-from monarch.
+from monarch.actor import Future
 
-from monarch.common.
+from monarch.common.device_mesh import no_mesh
 
 from monarch.common.remote import call_on_shard_and_fetch, remote_identity
 
monarch/gradient/_gradient_generator.so
CHANGED
Binary file
monarch/gradient_generator.py
CHANGED
@@ -151,14 +151,16 @@ def grad_function(fn):
 
 
 def gradient_execution_order(
-    roots: Sequence[TensorOrEdge], with_respect_to: Sequence[
+    roots: Sequence[TensorOrEdge], with_respect_to: Sequence[Any]
 ) -> List[int]:
     """
     Returns the order in which the gradients for `with_respect_to` would become available
     if autograd were run on `roots`. This is the reverse order of each tensors
     first use in the gradient computation.
     """
-    with_respect_to = [
+    with_respect_to = [
+        (g.node, g.output_nr) for g in map(_gradient_edge, with_respect_to)
+    ]
     min_sequence_nr: Dict[Any, float] = {e: math.inf for e in with_respect_to}
 
     to_scan = [_gradient_edge(r).node for r in roots]
monarch/mesh_controller.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 import atexit
 import logging
 import os
@@ -43,7 +45,7 @@ from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarc
     ActorId,
 )
 from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
-from monarch._src.actor.actor_mesh import ActorEndpoint,
+from monarch._src.actor.actor_mesh import ActorEndpoint, Channel, Port
 from monarch._src.actor.endpoint import Selection
 from monarch._src.actor.shape import NDSlice
 from monarch.common import device_mesh, messages, stream
@@ -63,6 +65,7 @@ if TYPE_CHECKING:
     from monarch.actor import ProcMesh
 
 from monarch._rust_bindings.monarch_hyperactor.shape import Point
+from monarch._src.actor.device_utils import _local_device_count
 
 from monarch.common.client import Client
 from monarch.common.controller_api import LogMessage, MessageResult
@@ -119,9 +122,18 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
     worker_rank = worker_point.rank
     try:
         _, worker_env = _get_worker_exec_info()
-
-
-
+
+        if "gpus" in worker_point:
+            local_rank = worker_point["gpus"]
+            gpus_per_host = worker_point.size("gpus")
+        elif "gpu" in worker_point:
+            local_rank = worker_point["gpu"]
+            gpus_per_host = worker_point.size("gpu")
+        else:
+            gpus_per_host = _local_device_count()
+            local_rank = worker_rank % gpus_per_host
+
+        num_worker_procs = worker_point.extent.nelements
         process_env = {
             **worker_env,
             "CUDA_VISIBLE_DEVICES": str(local_rank),
@@ -156,7 +168,7 @@ class MeshClient(Client):
         defs: Tuple["Tensor", ...],
         uses: Tuple["Tensor", ...],
     ) -> "OldFuture":  # the OldFuture is a lie
-        sender, receiver =
+        sender, receiver = Channel.open(once=True)
 
         ident = self.new_node(defs, uses, cast("OldFuture", sender))
         process = mesh._process(shard)
@@ -192,7 +204,7 @@ class MeshClient(Client):
         atexit.unregister(self._atexit)
         self._shutdown = True
 
-        sender, receiver =
+        sender, receiver = Channel.open(once=True)
         assert sender._port_ref is not None
         self._mesh_controller.sync_at_exit(sender._port_ref.port_id)
         receiver.recv().get(timeout=60)
@@ -200,6 +212,14 @@ class MeshClient(Client):
         # waited for the responses
         self.inner.drain_and_stop()
 
+    def _atexit(self) -> None:
+        # Calling self.shutdown may cause a deadlock if something is wrong with
+        # the networking. Or should we make shutdown() not wait indefinitely?
+        self._shutdown = True
+
+        # send shutdown message to stop other processes.
+        self.inner.stop_mesh()
+
     @property
     def _mesh_controller(self) -> Controller:
         return cast(Controller, self.inner)
@@ -235,7 +255,9 @@ def spawn_tensor_engine(proc_mesh: "ProcMesh") -> DeviceMesh:
     # is currently only used for debug printing. It should be fixed to
     # report the proc ID instead of the rank it currently does.
     gpus = proc_mesh.sizes.get("gpus", 1)
-
+
+    # we currently block on the creation of the proc mesh, but conceivably we could init concurrently here.
+    backend_ctrl = Controller(proc_mesh._proc_mesh.block_on())
     client = MeshClient(cast("TController", backend_ctrl), proc_mesh.size(), gpus)
     dm = DeviceMesh(
         client,
@@ -273,7 +295,7 @@ class RemoteException(Exception):
 
 def _cast_call_method_indirect(
     endpoint: ActorEndpoint,
-    selection:
+    selection: str,
     client: MeshClient,
     seq: Seq,
     args_kwargs_tuple: bytes,
@@ -290,7 +312,7 @@ def _cast_call_method_indirect(
         ),
         args_kwargs_tuple,
     )
-    endpoint._actor_mesh.cast(actor_msg, selection)
+    endpoint._actor_mesh.cast(actor_msg, selection, endpoint._mailbox)
     return broker_id
 
 
@@ -299,7 +321,7 @@ def actor_send(
     args_kwargs_tuple: bytes,
     refs: Sequence[Any],
     port: Optional[Port[Any]],
-    selection:
+    selection: str,
 ):
     tensors = [ref for ref in refs if isinstance(ref, Tensor)]
     # we have some monarch references, we need to ensure their
@@ -314,9 +336,7 @@ def actor_send(
     # TODO: move propagators into Endpoint abstraction and run the propagator to get the
     # mutates
     checker.check_permission(())
-    selected_device_mesh =
-        endpoint._actor_mesh._proc_mesh and endpoint._actor_mesh._proc_mesh._device_mesh
-    )
+    selected_device_mesh = endpoint._proc_mesh and endpoint._proc_mesh._device_mesh
     if selected_device_mesh is not checker.mesh:
         raise ValueError(
             f"monarch Tensors sent to an actor must be located on the same process as the actor. However {checker.mesh} is not {selected_device_mesh}."
@@ -350,7 +370,7 @@ def _actor_send(
     args_kwargs_tuple: bytes,
     refs: Sequence[Any],
     port: Optional[Port[Any]],
-    selection:
+    selection: str,
     client: MeshClient,
     mesh: DeviceMesh,
     tensors: List[Tensor],
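The _initialize_env hunk above introduces a three-way fallback for choosing the device a worker binds to: an explicit "gpus" dimension on the worker point wins, then a singular "gpu" dimension, and otherwise the local rank is derived from the host's device count. The standalone sketch below mirrors just that selection logic, with plain dicts standing in for the Point object; the function name and its arguments are illustrative.

def pick_local_device(
    point: dict[str, int],    # dimension name -> this worker's coordinate
    sizes: dict[str, int],    # dimension name -> size of that dimension
    worker_rank: int,
    host_device_count: int,   # e.g. the number of CUDA devices on the host
) -> tuple[int, int]:
    """Return (local_rank, gpus_per_host) using the same precedence as the diff."""
    if "gpus" in point:
        return point["gpus"], sizes["gpus"]
    if "gpu" in point:
        return point["gpu"], sizes["gpu"]
    gpus_per_host = host_device_count
    return worker_rank % gpus_per_host, gpus_per_host


# A mesh without a gpu dimension: worker rank 11 on hosts with 8 devices -> local rank 3
print(pick_local_device({}, {}, worker_rank=11, host_device_count=8))  # (3, 8)

The resulting local_rank is what the diff writes into CUDA_VISIBLE_DEVICES, so each worker process only sees its own device.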
monarch/monarch_controller
CHANGED
Binary file
monarch/tools/colors.py
ADDED
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import sys
+
+# only print colors if outputting directly to a terminal
+if not sys.stdout.closed and sys.stdout.isatty():
+    GREEN = "\033[32m"
+    BLUE = "\033[34m"
+    ORANGE = "\033[38:2:238:76:44m"
+    GRAY = "\033[2m"
+    CYAN = "\033[36m"
+    ENDC = "\033[0m"
+else:
+    GREEN = ""
+    ORANGE = ""
+    BLUE = ""
+    GRAY = ""
+    CYAN = ""
+    ENDC = ""
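These constants are meant to be interpolated directly into f-strings and reset with ENDC; because they collapse to empty strings when stdout is not a terminal, call sites (such as the ones added in monarch/tools/commands.py below) need no TTY checks of their own. A small usage sketch with a placeholder job handle:

from monarch.tools.colors import CYAN, ENDC, GRAY

server_handle = "slurm:///monarch-example"  # placeholder job handle
print(f"{CYAN}Found existing job `{server_handle}` ready to serve.{ENDC}")
print(f"{GRAY}colors degrade to plain text when piped to a file{ENDC}")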
monarch/tools/commands.py
CHANGED
@@ -11,9 +11,12 @@ import asyncio
 import inspect
 import logging
 import os
+import tempfile
 from datetime import datetime, timedelta
+from pathlib import Path
 from typing import Any, Callable, Mapping, Optional, Union
 
+from monarch.tools.colors import CYAN, ENDC
 from monarch.tools.components.hyperactor import DEFAULT_NAME
 
 from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
@@ -21,6 +24,8 @@ from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/con
     defaults,
 )
 from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
+from monarch.tools.utils import MONARCH_HOME
+
 from torchx.runner import Runner  # @manual=//torchx/runner:lib_core
 from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
 from torchx.specs.builders import parse_args
@@ -125,8 +130,18 @@ def create(
 
     with torchx_runner() as runner:
         appdef: AppDef = AppDef(name, config.appdef.roles, config.appdef.metadata)
+        if not config.workspace.dirs and not config.workspace.env:
+            info = runner.dryrun(appdef, scheduler, cfg, workspace=None)
+        else:
+            with tempfile.TemporaryDirectory(dir=MONARCH_HOME("out")) as tmpdir:
+                # multi-directory workspace is not supported natively in torchx; so merge into a single one
+                # TODO (kiuk@) may be able to delete bootstrap workspace copy (as the job is created)
+                #   since proc_mesh.sync_workspace() can do this without having to merge the workspace
+                workspace_out = Path(tmpdir) / "workspace"
+                config.workspace.merge(workspace_out)
+                config.workspace.set_env_vars(appdef)
 
-
+                info = runner.dryrun(appdef, scheduler, cfg, str(workspace_out))
 
         info_json_fmt = AppDryRunInfo(
             info.request,
@@ -173,19 +188,25 @@ def info(server_handle: str) -> Optional[ServerSpec]:
 
         # null-guard since some schedulers do not fill replica_status
         if host_status := replica_status.get(role.name):
-
+            # make sure the hostnames are sorted by their respective node indexes
+            # this makes ServerSpec.host0 return hostname of node 0
+            spec.hostnames = [
+                h.hostname for h in sorted(host_status, key=lambda h: h.id)
+            ]
             # the mesh status is based on the "least progressive" replica status
             spec.state = min(h.state for h in host_status)
 
         mesh_specs.append(spec)
 
     scheduler, namespace, _ = parse_app_handle(server_handle)
+
     return ServerSpec(
         name=appdef.name,
         state=status.state,
         meshes=mesh_specs,
         scheduler=scheduler,
         namespace=namespace,
+        ui_url=status.ui_url,
     )
 
 
@@ -263,6 +284,7 @@ async def get_or_create(
     name: str,
     config: Config,
     check_interval: timedelta = _5_SECONDS,
+    force_restart: bool = False,
 ) -> ServerSpec:
     """Waits for the server based on identity `name` in the scheduler specified in the `config`
     to be ready (e.g. RUNNING). If the server is not found then this function creates one
@@ -280,6 +302,12 @@ async def get_or_create(
         server_handle = get_or_create(name="my_job_name", config)
         server_info = info(server_handle)
 
+    Args:
+        name: the name of the server (job) to get or create
+        config: configs used to create the job if one does not exist
+        check_interval: how often to poll the status of the job when waiting for it to be ready
+        force_restart: if True kills and re-creates the job even if one exists
+
     Returns: A `ServerSpec` containing information about either the existing or the newly
         created server.
 
@@ -288,7 +316,6 @@ async def get_or_create(
 
     server_handle = f"{config.scheduler}:///{name}"
     server_info = await server_ready(server_handle, check_interval)
-
     if not server_info or not server_info.is_running:  # then create one
         logger.info(
             "no existing RUNNING server `%s` creating new one...", server_handle
@@ -311,11 +338,19 @@ async def get_or_create(
                 f"the new server `{new_server_handle}` has {server_info.state}"
             )
 
-        print(f"
-        return server_info
+        print(f"{CYAN}New job `{new_server_handle}` is ready to serve.{ENDC}")
     else:
-        print(f"
-
+        print(f"{CYAN}Found existing job `{server_handle}` ready to serve.{ENDC}")
+
+        if force_restart:
+            print(f"{CYAN}force_restart=True, restarting `{server_handle}`.{ENDC}")
+            kill(server_handle)
+            server_info = await get_or_create(name, config, check_interval)
+
+    if server_info.ui_url:  # not all schedulers have a UI URL
+        print(f"{CYAN}Job URL: {server_info.ui_url}{ENDC}")
+
+    return server_info
 
 
 def kill(server_handle: str) -> None:
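A hedged usage sketch for the new force_restart flag on get_or_create: when set, an existing RUNNING server is killed and re-created rather than reused. The scheduler and job names are placeholders, and the minimal Config relies on its new defaults (empty Workspace, unnamed appdef).

import asyncio

from monarch.tools.commands import get_or_create
from monarch.tools.config import Config

config = Config(scheduler="slurm")  # placeholder scheduler
server_info = asyncio.run(
    get_or_create("my_job_name", config, force_restart=True)
)
print(server_info.name, server_info.state)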
monarch/tools/components/hyperactor.py
CHANGED
@@ -9,7 +9,8 @@ import getpass
 from typing import Optional
 
 from monarch.tools import mesh_spec
-
+
+from monarch.tools.config import NOT_SET
 from monarch.tools.mesh_spec import mesh_spec_from_str
 from torchx import specs
 
@@ -19,16 +20,17 @@ _USER: str = getpass.getuser()
 
 DEFAULT_NAME: str = f"monarch-{_USER}"
 
+
 __version__ = "latest"  # TODO get version from monarch.__version_
 
 
 def host_mesh(
-    image: str = f"ghcr.io/pytorch
+    image: str = f"ghcr.io/meta-pytorch/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
     meshes: list[str] = _DEFAULT_MESHES,
     env: Optional[dict[str, str]] = None,
     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
     program: str = "monarch_bootstrap",  # installed with monarch wheel (as console script)
-) ->
+) -> specs.AppDef:
     """
     Args:
         name: the name of the monarch server job
@@ -39,7 +41,7 @@ def host_mesh(
         program: path to the binary that the remote process allocator spawns on an allocation request
     """
 
-    appdef =
+    appdef = specs.AppDef(name=NOT_SET)
 
     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
         mesh_role = specs.Role(
monarch/tools/config/__init__.py
CHANGED
@@ -5,23 +5,24 @@
 # LICENSE file in the root directory of this source tree.
 
 # pyre-strict
+import warnings
 from dataclasses import dataclass, field
-from typing import Any
+from typing import Any
 
-from
+from monarch.tools.config.workspace import Workspace
 
+# Gracefully handle cases where torchx might not be installed
+# NOTE: this can be removed once torchx.specs moves to monarch.session
+try:
+    from torchx import specs
+except ImportError:
+    pass
 
 NOT_SET: str = "__NOT_SET__"
 
 
-
-
-    """
-    A TorchX AppDef without a name.
-    """
-
-    roles: List[Role] = field(default_factory=list)
-    metadata: Dict[str, str] = field(default_factory=dict)
+def _empty_appdef() -> "specs.AppDef":
+    return specs.AppDef(name=NOT_SET)
 
 
 @dataclass
@@ -32,6 +33,28 @@ class Config:
 
     scheduler: str = NOT_SET
     scheduler_args: dict[str, Any] = field(default_factory=dict)
-    workspace:
+    workspace: Workspace = field(default_factory=Workspace.null)
     dryrun: bool = False
-    appdef:
+    appdef: "specs.AppDef" = field(default_factory=_empty_appdef)
+
+    def __post_init__(self) -> None:
+        # workspace used to be Optional[str]
+        # while we type it as class Workspace now, handle workspace=None and str for BC
+        if self.workspace is None:
+            deprecation_msg = (
+                "Setting `workspace=None` is deprecated."
+                " Use `workspace=monarch.tools.config.workspace.Workspace(env=None)` instead."
+            )
+            warnings.warn(deprecation_msg, FutureWarning, stacklevel=2)
+            self.workspace = Workspace.null()
+        elif isinstance(self.workspace, str):
+            deprecation_msg = (
+                f"Setting `workspace='{self.workspace}'` is deprecated."
+                f" Use `workspace=monarch.tools.config.workspace.Workspace(dirs=['{self.workspace}'])` instead."
+            )
+            warnings.warn(deprecation_msg, FutureWarning, stacklevel=2)
+            # previous behavior (when workspace was a str pointing to the local project dir)
+            # was to copy the local dir into $WORKSPACE_DIR. For example:
+            #   ~/github/torch/** (local) -> $WORKSPACE_DIR/** (remote)
+            # so we map it to "".
+            self.workspace = Workspace(dirs={self.workspace: ""})
monarch/tools/config/defaults.py
CHANGED
@@ -8,10 +8,12 @@
 
 """Defines defaults for ``monarch.tools``"""
 
-
+import warnings
+from typing import Callable
 
 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config
+from monarch.tools.config import Config
+from monarch.tools.config.workspace import Workspace
 
 from torchx import specs
 from torchx.schedulers import (
@@ -23,7 +25,7 @@ from torchx.schedulers import (
 )
 
 
-def component_fn(scheduler: str) -> Callable[...,
+def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
     """The default TorchX component function for the scheduler"""
     return hyperactor.host_mesh
 
@@ -40,9 +42,17 @@ def scheduler_factories() -> dict[str, SchedulerFactory]:
     }
 
 
-def config(scheduler: str, workspace:
+def config(scheduler: str, workspace: str | None = None) -> Config:
     """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``."""
-
+    warnings.warn(
+        "`defaults.config()` is deprecated, prefer instantiating `Config()` directly",
+        FutureWarning,
+        stacklevel=2,
+    )
+    return Config(
+        scheduler=scheduler,
+        workspace=Workspace(dirs={workspace: ""}) if workspace else Workspace.null(),
+    )
 
 
 def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]:
monarch/tools/config/environment.py
ADDED
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from monarch.tools import utils
+
+
+class Environment:
+    """An environment holds the necessary dependencies for the projects (directories)
+    in a `monarch.tools.workspace.Workspace`. When specified as part of a Workspace,
+    the local environment is packed into an ephemeral "image" (e.g. Docker) to mirror
+    the locally installed packages on the remote job.
+    """
+
+    pass
+
+
+class CondaEnvironment(Environment):
+    """Reference to a conda environment.
+    If no `conda_prefix` is specified, then defaults to the currently active conda environment.
+    """
+
+    def __init__(self, conda_prefix: str | None = None) -> None:
+        self._conda_prefix = conda_prefix
+
+    @property
+    def conda_prefix(self) -> str:
+        """Returns the `conda_prefix` this object was instantiated with or the currently active conda environment
+        if no `conda_prefix` was specified in the constructor."""
+        if not self._conda_prefix:
+            active_conda_prefix = utils.conda.active_env_dir()
+            assert active_conda_prefix, "No currently active conda environment. Either specify a `conda_prefix` or activate one."
+            return active_conda_prefix
+        else:
+            return self._conda_prefix
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, CondaEnvironment):
+            return False
+
+        return self._conda_prefix == other._conda_prefix