torchmonarch-nightly 2025.7.1__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/proc_mesh.py CHANGED
@@ -4,315 +4,12 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
- # pyre-strict
7
+ import warnings
8
8
 
9
- import os
10
- import sys
11
- from contextlib import AbstractContextManager
12
-
13
- from typing import (
14
- Any,
15
- cast,
16
- Dict,
17
- List,
18
- Optional,
19
- Sequence,
20
- Type,
21
- TYPE_CHECKING,
22
- TypeVar,
23
- )
24
-
25
- if TYPE_CHECKING:
26
- import torch
27
-
28
- import monarch
29
- from monarch import ActorFuture as Future
30
-
31
- # Conditionally import DeviceMesh and spawn_tensor_engine only if tensor_engine is available
32
- # pyre-ignore[21]
33
- from monarch._rust_bindings import has_tensor_engine
34
-
35
- from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension # @manual=//monarch/monarch_extension:monarch_extension
36
- Alloc,
37
- AllocConstraints,
38
- AllocSpec,
39
- )
40
- from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
41
- from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
42
- ProcMesh as HyProcMesh,
43
- ProcMeshMonitor,
9
+ warnings.warn(
10
+ "monarch.proc_mesh is deprecated, please import from monarch.actor instead.",
11
+ DeprecationWarning,
12
+ stacklevel=2,
44
13
  )
45
- from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
46
- from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
47
-
48
- from monarch.code_sync import RemoteWorkspace, RsyncMeshClient
49
- from monarch.common._device_utils import _local_device_count
50
- from monarch.common.shape import MeshTrait
51
- from monarch.rdma import RDMAManager
52
-
53
- if has_tensor_engine():
54
- from monarch.common.device_mesh import DeviceMesh
55
- from monarch.mesh_controller import spawn_tensor_engine
56
- else:
57
- DeviceMesh = None
58
- spawn_tensor_engine = None
59
-
60
- T = TypeVar("T")
61
- try:
62
- from __manifest__ import fbmake # noqa
63
-
64
- IN_PAR = True
65
- except ImportError:
66
- IN_PAR = False
67
-
68
-
69
- async def _allocate_nonblocking(alloc: Alloc) -> "ProcMesh":
70
- return ProcMesh(await HyProcMesh.allocate_nonblocking(alloc))
71
-
72
-
73
- def _allocate_blocking(alloc: Alloc) -> "ProcMesh":
74
- return ProcMesh(HyProcMesh.allocate_blocking(alloc))
75
-
76
-
77
- class ProcMesh(MeshTrait):
78
- def __init__(
79
- self,
80
- hy_proc_mesh: HyProcMesh,
81
- _mock_shape: Optional[Shape] = None,
82
- _device_mesh: Optional[DeviceMesh] = None,
83
- ) -> None:
84
- self._proc_mesh = hy_proc_mesh
85
- self._mock_shape: Optional[Shape] = _mock_shape
86
- self._mailbox: Mailbox = self._proc_mesh.client
87
- self._rdma_manager: Optional[RDMAManager] = None
88
- self._rsync_mesh_client: Optional[RsyncMeshClient] = None
89
- self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
90
- if _mock_shape is None:
91
- self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
92
-
93
- @property
94
- def _shape(self) -> Shape:
95
- return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
96
-
97
- @property
98
- def _ndslice(self) -> Slice:
99
- return self._shape.ndslice
100
-
101
- @property
102
- def _labels(self) -> List[str]:
103
- return self._shape.labels
104
-
105
- def _new_with_shape(self, shape: Shape) -> "ProcMesh":
106
- device_mesh = (
107
- None
108
- if self._device_mesh is None
109
- else self._device_mesh._new_with_shape(shape)
110
- )
111
- return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
112
-
113
- def spawn(
114
- self, name: str, Class: Type[T], *args: Any, **kwargs: Any
115
- ) -> Future[ActorMeshRef[T]]:
116
- if self._mock_shape is not None:
117
- raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
118
- return Future(
119
- lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
120
- lambda: self._spawn_blocking(name, Class, *args, **kwargs),
121
- )
122
-
123
- async def monitor(self) -> ProcMeshMonitor:
124
- """
125
- Get a monitor (async iterator) of the proc mesh, it is used to
126
- monitor the status of the proc mesh. This function can be called at most once.
127
-
128
- Note: This API is experimental and subject to change.
129
-
130
- Example:
131
-
132
- async def monitor_loop(monitor):
133
- async for event in monitor:
134
- await handle_exception_event(event)
135
-
136
- # Kick off in background
137
- asyncio.create_task(monitor_loop(monitor))
138
- """
139
- return await self._proc_mesh.monitor()
140
-
141
- @classmethod
142
- def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
143
- return Future(
144
- lambda: _allocate_nonblocking(alloc),
145
- lambda: _allocate_blocking(alloc),
146
- )
147
-
148
- def _spawn_blocking(
149
- self, name: str, Class: Type[T], *args: Any, **kwargs: Any
150
- ) -> T:
151
- if not issubclass(Class, Actor):
152
- raise ValueError(
153
- f"{Class} must subclass monarch.service.Actor to spawn it."
154
- )
155
-
156
- actor_mesh = self._proc_mesh.spawn_blocking(name, _Actor)
157
- service = ActorMeshRef(
158
- Class,
159
- _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
160
- self._mailbox,
161
- )
162
- # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
163
- # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
164
- service._create(args, kwargs)
165
- return cast(T, service)
166
-
167
- def __repr__(self) -> str:
168
- return repr(self._proc_mesh)
169
-
170
- def __str__(self) -> str:
171
- return str(self._proc_mesh)
172
-
173
- async def _spawn_nonblocking(
174
- self, name: str, Class: Type[T], *args: Any, **kwargs: Any
175
- ) -> T:
176
- if not issubclass(Class, Actor):
177
- raise ValueError(
178
- f"{Class} must subclass monarch.service.Actor to spawn it."
179
- )
180
-
181
- actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
182
- service = ActorMeshRef(
183
- Class,
184
- _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
185
- self._mailbox,
186
- )
187
- # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
188
- # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
189
- service._create(args, kwargs)
190
- return cast(T, service)
191
-
192
- @property
193
- def _device_mesh(self) -> "DeviceMesh":
194
- if spawn_tensor_engine is None:
195
- raise RuntimeError(
196
- "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
197
- )
198
- if self._maybe_device_mesh is None:
199
- if self._mock_shape is not None:
200
- raise NotImplementedError(
201
- "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
202
- )
203
- self._maybe_device_mesh = spawn_tensor_engine(self)
204
- return self._maybe_device_mesh
205
-
206
- # pyre-ignore
207
- def activate(self) -> AbstractContextManager:
208
- return self._device_mesh.activate()
209
-
210
- def rank_tensor(self, dim: str | Sequence[str]) -> "torch.Tensor":
211
- return self._device_mesh.rank(dim)
212
-
213
- def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
214
- return self._device_mesh.ranks
215
-
216
- async def sync_workspace(self) -> None:
217
- if self._rsync_mesh_client is None:
218
- # TODO(agallagher): We need some way to configure and pass this
219
- # in -- right now we're assuming the `gpu` dimension, which isn't
220
- # correct.
221
- assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
222
- # The workspace shape (i.e. only perform one rsync per host).
223
- workspace_shape = self.slice(gpus=slice(0, 1, 1))._mock_shape
224
- assert workspace_shape is not None
225
- # TODO(agallagher): We should probably hide this behind something
226
- # like a `Workspace` class and support abstracting/configuring
227
- # different sync methods.
228
- self._rsync_mesh_client = RsyncMeshClient.spawn_blocking(
229
- proc_mesh=self._proc_mesh,
230
- shape=workspace_shape,
231
- # TODO(agallagher): Is there a better way to infer/set the local
232
- # workspace dir, rather than use PWD?
233
- local_workspace=os.getcwd(),
234
- remote_workspace=RemoteWorkspace.FromEnvVar("WORKSPACE_DIR"),
235
- )
236
- await self._rsync_mesh_client.sync_workspace()
237
-
238
-
239
- async def local_proc_mesh_nonblocking(
240
- *, gpus: Optional[int] = None, hosts: int = 1
241
- ) -> ProcMesh:
242
- if gpus is None:
243
- gpus = _local_device_count()
244
- spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
245
- allocator = monarch.LocalAllocator()
246
- alloc = await allocator.allocate(spec)
247
- return await ProcMesh.from_alloc(alloc)
248
-
249
-
250
- def local_proc_mesh_blocking(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
251
- if gpus is None:
252
- gpus = _local_device_count()
253
- spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
254
- allocator = monarch.LocalAllocator()
255
- alloc = allocator.allocate(spec).get()
256
- return ProcMesh.from_alloc(alloc).get()
257
-
258
-
259
- def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
260
- return Future(
261
- lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
262
- lambda: local_proc_mesh_blocking(gpus=gpus, hosts=hosts),
263
- )
264
-
265
-
266
- _BOOTSTRAP_MAIN = "monarch.bootstrap_main"
267
-
268
-
269
- def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
270
- if IN_PAR:
271
- cmd = sys.argv[0]
272
- args = None
273
- env = {
274
- "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
275
- }
276
- else:
277
- cmd = sys.executable
278
- args = ["-m", _BOOTSTRAP_MAIN]
279
- env = {}
280
-
281
- return cmd, args, env
282
-
283
-
284
- async def proc_mesh_nonblocking(
285
- *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
286
- ) -> ProcMesh:
287
- if gpus is None:
288
- gpus = _local_device_count()
289
- spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
290
- env = env or {}
291
- cmd, args, base_env = _get_bootstrap_args()
292
- env.update(base_env)
293
- allocator = monarch.ProcessAllocator(cmd, args, env)
294
- alloc = await allocator.allocate(spec)
295
- return await ProcMesh.from_alloc(alloc)
296
-
297
-
298
- def proc_mesh_blocking(
299
- *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
300
- ) -> ProcMesh:
301
- if gpus is None:
302
- gpus = _local_device_count()
303
- spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
304
- env = env or {}
305
- cmd, args, base_env = _get_bootstrap_args()
306
- env.update(base_env)
307
- allocator = monarch.ProcessAllocator(cmd, args, env)
308
- alloc = allocator.allocate(spec).get()
309
- return ProcMesh.from_alloc(alloc).get()
310
-
311
14
 
312
- def proc_mesh(
313
- *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
314
- ) -> Future[ProcMesh]:
315
- return Future(
316
- lambda: proc_mesh_nonblocking(gpus=gpus, hosts=hosts, env=env),
317
- lambda: proc_mesh_blocking(gpus=gpus, hosts=hosts, env=env),
318
- )
15
+ from monarch._src.actor.proc_mesh import * # noqa
monarch/python_local_mesh.py CHANGED
@@ -11,7 +11,7 @@ from time import sleep
11
11
  from typing import Optional, TYPE_CHECKING
12
12
 
13
13
  import monarch_supervisor
14
- from monarch.common._device_utils import _local_device_count
14
+ from monarch._src.actor.device_utils import _local_device_count
15
15
  from monarch.common.fake import fake_call
16
16
  from monarch.common.invocation import DeviceException, RemoteException
17
17
  from monarch.world_mesh import world_mesh
monarch/rust_backend_mesh.py CHANGED
@@ -20,11 +20,12 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarc
20
20
  init_proc,
21
21
  Proc,
22
22
  )
23
+
24
+ from monarch._src.actor.shape import NDSlice
23
25
  from monarch.common.client import Client
24
26
  from monarch.common.device_mesh import DeviceMesh, DeviceMeshStatus
25
27
  from monarch.common.invocation import DeviceException, RemoteException
26
28
  from monarch.common.mast import MastJob
27
- from monarch.common.shape import NDSlice
28
29
  from monarch.controller.rust_backend.controller import RustController
29
30
 
30
31
  TORCHX_MAST_TASK_GROUP_NAME = "script"
monarch/rust_local_mesh.py CHANGED
@@ -71,7 +71,7 @@ _MONARCH_TENSOR_WORKER_MAIN = "monarch.tensor_worker_main"
71
71
  try:
72
72
  from __manifest__ import fbmake # noqa
73
73
 
74
- IN_PAR = True
74
+ IN_PAR = bool(fbmake.get("par_style"))
75
75
  except ImportError:
76
76
  IN_PAR = False
77
77
 
@@ -122,7 +122,9 @@ _PROC_ENV: dict[str, str] = {}
122
122
 
123
123
  def get_controller_main() -> tuple[Path, dict[str, str]]:
124
124
  with (
125
- importlib.resources.path("monarch", "monarch_controller") as controller_main,
125
+ importlib.resources.as_file(
126
+ importlib.resources.files("monarch") / "monarch_controller"
127
+ ) as controller_main,
126
128
  ):
127
129
  if not controller_main.exists():
128
130
  if IN_PAR:
monarch/sim_mesh.py CHANGED
@@ -31,7 +31,6 @@ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monar
31
31
  )
32
32
 
33
33
  from monarch._rust_bindings.monarch_extension.simulator_client import ( # @manual=//monarch/monarch_extension:monarch_extension
34
- bootstrap_simulator_backend,
35
34
  SimulatorClient,
36
35
  )
37
36
 
@@ -40,6 +39,8 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarc
40
39
  init_proc,
41
40
  Proc,
42
41
  )
42
+
43
+ from monarch._src.actor.shape import NDSlice
43
44
  from monarch.common.client import Client
44
45
  from monarch.common.constants import (
45
46
  SIM_MESH_CLIENT_SUPERVISION_UPDATE_INTERVAL,
@@ -50,7 +51,6 @@ from monarch.common.fake import fake_call
50
51
  from monarch.common.future import Future, T
51
52
  from monarch.common.invocation import DeviceException, RemoteException
52
53
  from monarch.common.messages import Dims
53
- from monarch.common.shape import NDSlice
54
54
  from monarch.controller.rust_backend.controller import RustController
55
55
  from monarch.rust_backend_mesh import MeshWorld
56
56
 
@@ -58,9 +58,7 @@ from monarch.rust_backend_mesh import MeshWorld
58
58
  logger: logging.Logger = logging.getLogger(__name__)
59
59
 
60
60
 
61
- def sim_mesh(
62
- n_meshes: int, hosts: int, gpus_per_host: int, proxy_addr: Optional[str] = None
63
- ) -> List[DeviceMesh]:
61
+ def sim_mesh(n_meshes: int, hosts: int, gpus_per_host: int) -> List[DeviceMesh]:
64
62
  """
65
63
  Creates a single simulated device mesh with the given number of per host.
66
64
 
@@ -75,7 +73,6 @@ def sim_mesh(
75
73
  bootstrap: Bootstrap = Bootstrap(
76
74
  n_meshes,
77
75
  mesh_world_state,
78
- proxy_addr=proxy_addr,
79
76
  world_size=hosts * gpus_per_host,
80
77
  )
81
78
 
@@ -180,14 +177,12 @@ class Bootstrap:
180
177
  self,
181
178
  num_meshes: int,
182
179
  mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]],
183
- proxy_addr: Optional[str] = None,
184
180
  world_size: int = 1,
185
181
  ) -> None:
186
182
  """
187
183
  Bootstraps a SimMesh.
188
184
  Args:
189
185
  num_meshes: int - number of meshes to create.
190
- proxy_addr: Option[str] - the proxy address of the simulation process
191
186
  mesh_world_state: a state of the meshes. Keys are the MeshWorld and values are boolean indicating if this mesh is active.
192
187
  """
193
188
  # do a fake call to instantiate ThreadPoolExecutor so we don't block GIL later
@@ -198,17 +193,11 @@ class Bootstrap:
198
193
 
199
194
  self._mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]] = mesh_world_state
200
195
 
201
- proxy_addr = proxy_addr or f"unix!@{_random_id()}-proxy"
202
- self.bootstrap_addr: str = f"sim!unix!@system,{proxy_addr}"
203
-
204
- client_proxy_addr = f"unix!@{_random_id()}-proxy"
205
- self.client_listen_addr: str = f"sim!unix!@client,{client_proxy_addr}"
206
- self.client_bootstrap_addr: str = (
207
- f"sim!unix!@client,{client_proxy_addr},unix!@system,{proxy_addr}"
208
- )
209
- bootstrap_simulator_backend(self.bootstrap_addr, proxy_addr, world_size)
196
+ self.bootstrap_addr: str = "sim!unix!@system"
197
+ self.client_listen_addr = "sim!unix!@client"
198
+ self.client_bootstrap_addr = "sim!unix!@client,unix!@system"
210
199
 
211
- self._simulator_client = SimulatorClient(proxy_addr)
200
+ self._simulator_client = SimulatorClient(self.bootstrap_addr, world_size)
212
201
  for i in range(num_meshes):
213
202
  mesh_name: str = f"mesh_{i}"
214
203
  controller_world: str = f"{mesh_name}_controller"
@@ -234,7 +223,9 @@ class Bootstrap:
234
223
  worker_world, controller_id = mesh_world
235
224
  controller_world = controller_id.world_name
236
225
  self._simulator_client.spawn_mesh(
237
- self.bootstrap_addr, f"{controller_world}[0].root", worker_world
226
+ self.bootstrap_addr,
227
+ f"{controller_world}[0].root",
228
+ worker_world,
238
229
  )
239
230
 
240
231
 
monarch/simulator/command_history.py CHANGED
@@ -12,9 +12,9 @@ from dataclasses import dataclass
12
12
  from typing import List, NamedTuple, Optional, Sequence
13
13
 
14
14
  import torch
15
+ from monarch._src.actor.shape import NDSlice
15
16
 
16
17
  from monarch.common import messages
17
- from monarch.common.shape import NDSlice
18
18
  from monarch.simulator.ir import IRGraph
19
19
  from monarch.simulator.tensor import DTensorRef
20
20
  from monarch.simulator.utils import clean_name, file_path_with_iter
monarch/simulator/interface.py CHANGED
@@ -6,9 +6,10 @@
6
6
 
7
7
  from typing import Union
8
8
 
9
+ from monarch._src.actor.shape import NDSlice
10
+
9
11
  from monarch.common.client import Client as _Client
10
12
  from monarch.common.device_mesh import DeviceMesh
11
- from monarch.common.shape import NDSlice
12
13
 
13
14
  from monarch.simulator.ir import IRGraph
14
15
  from monarch.simulator.simulator import (
monarch/simulator/mock_controller.py CHANGED
@@ -25,6 +25,7 @@ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monar
25
25
  from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
26
26
  ActorId,
27
27
  )
28
+ from monarch._src.actor.shape import iter_ranks, NDSlice, Slices as Ranks
28
29
 
29
30
  from monarch.common import messages
30
31
 
@@ -32,7 +33,6 @@ from monarch.common.controller_api import DebuggerMessage, LogMessage, MessageRe
32
33
  from monarch.common.device_mesh import no_mesh
33
34
  from monarch.common.invocation import Invocation, RemoteException, Seq
34
35
  from monarch.common.reference import Ref
35
- from monarch.common.shape import iter_ranks, NDSlice, Slices as Ranks
36
36
  from monarch.common.tree import flatten
37
37
 
38
38
  if TYPE_CHECKING:
monarch/simulator/simulator.py CHANGED
@@ -43,12 +43,12 @@ import torch
43
43
  from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
44
44
  ActorId,
45
45
  )
46
+ from monarch._src.actor.shape import iter_ranks, NDSlice
46
47
  from monarch.common import messages
47
48
  from monarch.common.controller_api import LogMessage, MessageResult
48
49
  from monarch.common.device_mesh import DeviceMesh
49
50
  from monarch.common.function import ResolvableFunction, ResolvableFunctionFromPath
50
51
  from monarch.common.invocation import DeviceException
51
- from monarch.common.shape import iter_ranks, NDSlice
52
52
  from monarch.simulator.command_history import CommandHistory, DTensorRef
53
53
  from monarch.simulator.config import META_VAL
54
54
  from monarch.simulator.ir import IRGraph
monarch/tensor_engine/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Monarch Tensor Engine API - Public interface for tensor engine functionality.
9
+ """
10
+
11
+ from monarch._src.tensor_engine.rdma import (
12
+ is_available,
13
+ RDMABuffer,
14
+ RDMAReadTransferWarning,
15
+ RDMAWriteTransferWarning,
16
+ )
17
+
18
+ __all__ = [
19
+ "is_available",
20
+ "RDMABuffer",
21
+ "RDMAReadTransferWarning",
22
+ "RDMAWriteTransferWarning",
23
+ ]
monarch/tensor_worker_main.py CHANGED
@@ -249,7 +249,9 @@ if __name__ == "__main__":
249
249
  torch.cuda.set_device = check_set_device
250
250
 
251
251
  with (
252
- importlib.resources.path("monarch", "py-spy") as pyspy,
252
+ importlib.resources.as_file(
253
+ importlib.resources.files("monarch") / "py-spy"
254
+ ) as pyspy,
253
255
  ):
254
256
  if pyspy.exists():
255
257
  os.environ["PYSPY_BIN"] = str(pyspy)
monarch/tools/cli.py CHANGED
@@ -86,7 +86,9 @@ class CreateCmd:
86
86
  else defaults.component_fn(config.scheduler)
87
87
  )
88
88
  component_args = component_args_from_cli(component_fn, args.component_args)
89
- handle = create(config, component_fn)(**component_args)
89
+ appdef = component_fn(**component_args)
90
+
91
+ handle = create(config, appdef)
90
92
  print(handle)
91
93
 
92
94