torchmonarch-nightly 2025.8.2__cp310-cp310-manylinux2014_x86_64.whl → 2025.9.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +504 -218
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +6 -4
  40. monarch/tools/config/__init__.py +35 -12
  41. monarch/tools/config/defaults.py +15 -5
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +3 -3
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_debugger.py +639 -45
  53. tests/test_env_before_cuda.py +4 -4
  54. tests/test_mesh_trait.py +38 -0
  55. tests/test_python_actors.py +965 -75
  56. tests/test_rdma.py +7 -6
  57. tests/test_tensor_engine.py +6 -6
  58. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/METADATA +82 -4
  59. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/RECORD +63 -47
  60. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/WHEEL +0 -0
  61. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/entry_points.txt +0 -0
  62. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/licenses/LICENSE +0 -0
  63. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/top_level.txt +0 -0
@@ -6,83 +6,91 @@
6
6
 
7
7
  # pyre-strict
8
8
 
9
+ import asyncio
9
10
  import logging
10
11
  import os
11
12
  import sys
13
+ import threading
12
14
  import warnings
13
15
  from contextlib import AbstractContextManager
14
16
 
17
+ from functools import cache
18
+ from pathlib import Path
19
+
15
20
  from typing import (
16
21
  Any,
17
22
  Callable,
18
23
  cast,
19
24
  Dict,
20
25
  List,
26
+ Literal,
21
27
  Optional,
22
28
  Sequence,
29
+ Tuple,
23
30
  Type,
24
31
  TYPE_CHECKING,
25
32
  TypeVar,
26
33
  )
34
+ from weakref import WeakValueDictionary
27
35
 
28
- from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
29
36
  from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
30
37
  Alloc,
31
38
  AllocConstraints,
32
39
  AllocSpec,
33
40
  )
34
- from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
41
+
35
42
  from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
36
43
  ProcMesh as HyProcMesh,
37
44
  ProcMeshMonitor,
38
45
  )
46
+ from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
39
47
  from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
40
- from monarch._src.actor.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
41
-
48
+ from monarch._src.actor.actor_mesh import _Actor, Actor, ActorMesh, context
42
49
  from monarch._src.actor.allocator import (
43
50
  AllocateMixin,
51
+ AllocHandle,
44
52
  LocalAllocator,
45
53
  ProcessAllocator,
46
54
  SimAllocator,
47
55
  )
48
56
  from monarch._src.actor.code_sync import (
49
57
  CodeSyncMeshClient,
58
+ CodeSyncMethod,
50
59
  RemoteWorkspace,
60
+ WorkspaceConfig,
51
61
  WorkspaceLocation,
52
62
  WorkspaceShape,
53
63
  )
54
- from monarch._src.actor.debugger import (
55
- _DEBUG_MANAGER_ACTOR_NAME,
56
- DebugClient,
57
- DebugManager,
58
- )
59
-
60
64
  from monarch._src.actor.device_utils import _local_device_count
61
65
 
62
66
  from monarch._src.actor.endpoint import endpoint
63
- from monarch._src.actor.future import Future
67
+ from monarch._src.actor.future import DeprecatedNotAFuture, Future
68
+ from monarch._src.actor.logging import LoggingManager
64
69
  from monarch._src.actor.shape import MeshTrait
70
+ from monarch.tools.config.environment import CondaEnvironment
71
+ from monarch.tools.config.workspace import Workspace
72
+ from monarch.tools.utils import conda as conda_utils
65
73
 
66
- HAS_TENSOR_ENGINE = False
67
- try:
68
- # Torch is needed for tensor engine
69
- import torch # @manual
70
74
 
71
- # Confirm that rust bindings were built with tensor engine enabled
72
- from monarch._rust_bindings.rdma import ( # type: ignore[import]
73
- _RdmaBuffer,
74
- _RdmaManager,
75
- )
75
+ @cache
76
+ def _has_tensor_engine() -> bool:
77
+ try:
78
+ # Torch is needed for tensor engine
79
+ import torch # @manual
76
80
 
77
- # type: ignore[16]
78
- HAS_TENSOR_ENGINE = torch.cuda.is_available()
79
- except ImportError:
80
- logging.warning("Tensor engine is not available on this platform")
81
+ # Confirm that rust bindings were built with tensor engine enabled
82
+ from monarch._rust_bindings.rdma import _RdmaManager # noqa
83
+
84
+ return True
85
+ except ImportError:
86
+ logging.warning("Tensor engine is not available on this platform")
87
+ return False
81
88
 
82
89
 
83
90
  if TYPE_CHECKING:
84
91
  Tensor = Any
85
92
  DeviceMesh = Any
93
+ from monarch._src.actor.host_mesh import HostMesh
86
94
 
87
95
 
88
96
  class SetupActor(Actor):
@@ -114,55 +122,108 @@ except ImportError:
114
122
  IN_PAR = False
115
123
 
116
124
 
117
- class ProcMesh(MeshTrait):
125
+ # A temporary gate used by the PythonActorMesh/PythonActorMeshRef migration.
126
+ # We can use this gate to quickly roll back to using _ActorMeshRefImpl, if we
127
+ # encounter any issues with the migration.
128
+ #
129
+ # This should be removed once we confirm PythonActorMesh/PythonActorMeshRef is
130
+ # working correctly in production.
131
+ @cache
132
+ def _use_standin_mesh() -> bool:
133
+ return os.getenv("USE_STANDIN_ACTOR_MESH", default="0") != "0"
134
+
135
+
136
+ # Ultra-hack to allow actors to identify proc meshes but with no real functionality.
137
+ class ProcMeshRef:
138
+ def __init__(self, proc_mesh_id: int) -> None:
139
+ self._proc_mesh_id = proc_mesh_id
140
+ self._host_mesh: Optional["HostMesh"] = None
141
+
142
+ @classmethod
143
+ def _fake_proc_mesh(cls, proc_mesh_id: int) -> "ProcMesh":
144
+ return cast(ProcMesh, cls(proc_mesh_id))
145
+
146
+ def __getattr__(self, attr: str) -> Any:
147
+ # AttributeError instead of NotImplementedError so that any hasattr calls
148
+ # will properly return False
149
+ raise AttributeError(
150
+ f"NYI: attempting to get ProcMesh attribute `{attr}` on object that's actually a ProcMeshRef"
151
+ )
152
+
153
+ def __hash__(self) -> int:
154
+ return hash(self._proc_mesh_id)
155
+
156
+ def __eq__(self, other: object) -> bool:
157
+ if not isinstance(other, ProcMeshRef):
158
+ return False
159
+ return self._proc_mesh_id == other._proc_mesh_id
160
+
161
+ @property
162
+ def _proc_mesh(self) -> Shared["HyProcMesh"]:
163
+ return _deref_proc_mesh(self)._proc_mesh
164
+
165
+
166
+ _proc_mesh_lock: threading.Lock = threading.Lock()
167
+ _proc_mesh_key: int = 0
168
+ _proc_mesh_registry: WeakValueDictionary[ProcMeshRef, "ProcMesh"] = (
169
+ WeakValueDictionary()
170
+ )
171
+
172
+
173
+ def _deref_proc_mesh(proc_mesh: ProcMeshRef) -> "ProcMesh":
174
+ if proc_mesh not in _proc_mesh_registry:
175
+ raise ValueError(
176
+ f"ProcMesh with id {proc_mesh._proc_mesh_id} does not exist on host."
177
+ )
178
+ return _proc_mesh_registry[proc_mesh]
179
+
180
+
181
+ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
118
182
  def __init__(
119
183
  self,
120
- hy_proc_mesh: HyProcMesh,
121
- _mock_shape: Optional[Shape] = None,
184
+ hy_proc_mesh: "Shared[HyProcMesh]",
185
+ shape: Shape,
122
186
  _device_mesh: Optional["DeviceMesh"] = None,
123
187
  ) -> None:
124
188
  self._proc_mesh = hy_proc_mesh
125
- self._mock_shape: Optional[Shape] = _mock_shape
126
- # type: ignore[21]
127
- self._rdma_manager: Optional["_RdmaManager"] = None
128
- self._debug_manager: Optional[DebugManager] = None
129
- self._mailbox: Mailbox = self._proc_mesh.client
189
+ global _proc_mesh_lock, _proc_mesh_key
190
+ with _proc_mesh_lock:
191
+ self._proc_mesh_id: int = _proc_mesh_key
192
+ _proc_mesh_key += 1
193
+ self._shape = shape
194
+ # until we have real slicing support keep track
195
+ # of whether this is a slice of a real proc_mesh
196
+ self._slice = False
130
197
  self._code_sync_client: Optional[CodeSyncMeshClient] = None
131
- self._logging_mesh_client: Optional[LoggingMeshClient] = None
198
+ self._logging_manager: LoggingManager = LoggingManager()
132
199
  self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
133
200
  self._stopped = False
201
+ self._controller_controller: Optional["_ControllerController"] = None
202
+ # current set only for context()'s proc_mesh to be a local host mesh.
203
+ self._host_mesh: Optional["HostMesh"] = None
134
204
 
135
- async def _init_manager_actors(
136
- self,
137
- setup: Callable[[], None] | None = None,
138
- ) -> "ProcMesh":
139
- _rdma_manager = (
140
- # type: ignore[16]
141
- await _RdmaManager.create_rdma_manager_nonblocking(self._proc_mesh)
142
- # type: ignore[16]
143
- if HAS_TENSOR_ENGINE and _RdmaBuffer.rdma_supported()
144
- else None
145
- )
146
-
147
- _debug_manager = await self._spawn_nonblocking(
148
- _DEBUG_MANAGER_ACTOR_NAME, DebugManager, await _debug_client()
149
- )
205
+ @property
206
+ def initialized(self) -> Future[Literal[True]]:
207
+ """
208
+ Future completes with 'True' when the ProcMesh has initialized.
209
+ Because ProcMeshes are remote objects, there is no guarantee that the ProcMesh is
210
+ still usable after this completes, only that at some point in the past it was usable.
211
+ """
212
+ pm: Shared[HyProcMesh] = self._proc_mesh
150
213
 
151
- self._debug_manager = _debug_manager
152
- self._rdma_manager = _rdma_manager
214
+ async def task() -> Literal[True]:
215
+ await pm
216
+ return True
153
217
 
154
- if setup is not None:
155
- # If the user has passed the setup lambda, we need to call
156
- # it here before any of the other actors are spawned so that
157
- # the environment variables are set up before cuda init.
158
- setup_actor = await self._spawn_nonblocking("setup", SetupActor, setup)
159
- # pyre-ignore
160
- await setup_actor.setup.call()._status.coro
161
- return self
218
+ return Future(coro=task())
162
219
 
163
220
  @property
164
- def _shape(self) -> Shape:
165
- return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
221
+ def host_mesh(self) -> "HostMesh":
222
+ if self._host_mesh is None:
223
+ raise NotImplementedError(
224
+ "NYI complete for release 0.1 (ProcMeshRef knowing its host mesh)"
225
+ )
226
+ return self._host_mesh
166
227
 
167
228
  @property
168
229
  def _ndslice(self) -> Slice:
@@ -173,17 +234,34 @@ class ProcMesh(MeshTrait):
173
234
  return self._shape.labels
174
235
 
175
236
  def _new_with_shape(self, shape: Shape) -> "ProcMesh":
237
+ # make sure that if we slice something with unity,
238
+ # we do not lose the ability to spawn on it.
239
+ # remove when spawn is implemented.
240
+ if shape == self._shape:
241
+ return self
176
242
  device_mesh = (
177
243
  None
178
244
  if self._maybe_device_mesh is None
179
245
  else self._device_mesh._new_with_shape(shape)
180
246
  )
181
- return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
247
+ pm = ProcMesh(self._proc_mesh, shape, _device_mesh=device_mesh)
248
+ pm._slice = True
249
+ return pm
182
250
 
183
- def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
184
- if self._mock_shape is not None:
251
+ def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> T:
252
+ if self._slice:
185
253
  raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
186
- return Future(coro=self._spawn_nonblocking(name, Class, *args, **kwargs))
254
+ return self._spawn_nonblocking(name, Class, *args, **kwargs)
255
+
256
+ @property
257
+ async def _proc_mesh_for_asyncio_fixme(self) -> HyProcMesh:
258
+ """
259
+ Get ProcMesh on the asyncio event stream.
260
+ We should redo this functionality to work on the tokio stream.
261
+ This must be called on the asyncio stream.
262
+ """
263
+ assert asyncio.get_running_loop() is not None
264
+ return await Future(coro=self._proc_mesh.task())
187
265
 
188
266
  async def monitor(self) -> ProcMeshMonitor:
189
267
  """
@@ -201,12 +279,17 @@ class ProcMesh(MeshTrait):
201
279
  # Kick off in background
202
280
  asyncio.create_task(monitor_loop(monitor))
203
281
  """
204
- return await self._proc_mesh.monitor()
282
+ # todo: move monitor to tokio loop
283
+ proc_mesh = await Future(coro=self._proc_mesh.task())
284
+ return await proc_mesh.monitor()
205
285
 
206
286
  @classmethod
207
287
  def from_alloc(
208
- self, alloc: Alloc, setup: Callable[[], None] | None = None
209
- ) -> Future["ProcMesh"]:
288
+ self,
289
+ alloc: AllocHandle,
290
+ setup: Callable[[], None] | None = None,
291
+ _attach_controller_controller: bool = True,
292
+ ) -> "ProcMesh":
210
293
  """
211
294
  Allocate a process mesh according to the provided alloc.
212
295
  Returns when the mesh is fully allocated.
@@ -225,37 +308,98 @@ class ProcMesh(MeshTrait):
225
308
  os.environ["LOCAL_RANK"] = str(rank["gpus"])
226
309
  ```
227
310
  """
228
- return Future(
229
- coro=_proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors=True)
311
+
312
+ async def task() -> HyProcMesh:
313
+ return await HyProcMesh.allocate_nonblocking(await alloc._hy_alloc)
314
+
315
+ shape = Shape(
316
+ list(alloc._extent.keys()),
317
+ Slice.new_row_major(list(alloc._extent.values())),
230
318
  )
231
319
 
320
+ hy_proc_mesh = PythonTask.from_coroutine(task()).spawn()
321
+
322
+ pm = ProcMesh(hy_proc_mesh, shape)
323
+ if _attach_controller_controller:
324
+ instance = context().actor_instance
325
+ pm._controller_controller = instance._controller_controller
326
+ instance._add_child(pm)
327
+
328
+ async def task(
329
+ pm: "ProcMesh",
330
+ hy_proc_mesh_task: "Shared[HyProcMesh]",
331
+ setup_actor: Optional[SetupActor],
332
+ stream_log_to_client: bool,
333
+ ) -> HyProcMesh:
334
+ hy_proc_mesh = await hy_proc_mesh_task
335
+
336
+ await pm._logging_manager.init(hy_proc_mesh, stream_log_to_client)
337
+
338
+ if setup_actor is not None:
339
+ await setup_actor.setup.call()
340
+
341
+ return hy_proc_mesh
342
+
343
+ setup_actor = None
344
+ if setup is not None:
345
+ # If the user has passed the setup lambda, we need to call
346
+ # it here before any of the other actors are spawned so that
347
+ # the environment variables are set up before cuda init.
348
+ setup_actor = pm._spawn_nonblocking_on(
349
+ hy_proc_mesh, "setup", SetupActor, setup
350
+ )
351
+
352
+ pm._proc_mesh = PythonTask.from_coroutine(
353
+ task(pm, hy_proc_mesh, setup_actor, alloc.stream_logs)
354
+ ).spawn()
355
+
356
+ return pm
357
+
232
358
  def __repr__(self) -> str:
233
359
  return repr(self._proc_mesh)
234
360
 
235
361
  def __str__(self) -> str:
236
362
  return str(self._proc_mesh)
237
363
 
238
- async def _spawn_nonblocking(
364
+ def _spawn_nonblocking(
239
365
  self, name: str, Class: Type[T], *args: Any, **kwargs: Any
366
+ ) -> T:
367
+ return self._spawn_nonblocking_on(self._proc_mesh, name, Class, *args, **kwargs)
368
+
369
+ def to_table(self) -> str:
370
+ return self._device_mesh.to_table()
371
+
372
+ def _spawn_nonblocking_on(
373
+ self,
374
+ pm: "Shared[HyProcMesh]",
375
+ name: str,
376
+ Class: Type[T],
377
+ *args: Any,
378
+ **kwargs: Any,
240
379
  ) -> T:
241
380
  if not issubclass(Class, Actor):
242
381
  raise ValueError(
243
382
  f"{Class} must subclass monarch.service.Actor to spawn it."
244
383
  )
245
- actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
246
- service = ActorMeshRef(
384
+
385
+ actor_mesh = HyProcMesh.spawn_async(pm, name, _Actor, _use_standin_mesh())
386
+ instance = context().actor_instance
387
+ service = ActorMesh._create(
247
388
  Class,
248
- _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh, self),
249
- self._mailbox,
389
+ actor_mesh,
390
+ instance._mailbox,
391
+ self._shape,
392
+ self,
393
+ self._controller_controller,
394
+ *args,
395
+ **kwargs,
250
396
  )
251
- # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
252
- # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
253
- service._create(args, kwargs)
397
+ instance._add_child(service)
254
398
  return cast(T, service)
255
399
 
256
400
  @property
257
401
  def _device_mesh(self) -> "DeviceMesh":
258
- if not HAS_TENSOR_ENGINE:
402
+ if not _has_tensor_engine():
259
403
  raise RuntimeError(
260
404
  "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
261
405
  )
@@ -264,7 +408,7 @@ class ProcMesh(MeshTrait):
264
408
  from monarch.mesh_controller import spawn_tensor_engine # @manual
265
409
 
266
410
  if self._maybe_device_mesh is None:
267
- if self._mock_shape is not None:
411
+ if self._slice:
268
412
  raise NotImplementedError(
269
413
  "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
270
414
  )
@@ -282,52 +426,97 @@ class ProcMesh(MeshTrait):
282
426
  def rank_tensors(self) -> Dict[str, "Tensor"]:
283
427
  return self._device_mesh.ranks
284
428
 
285
- async def sync_workspace(self, auto_reload: bool = False) -> None:
429
+ async def sync_workspace(
430
+ self,
431
+ workspace: Workspace,
432
+ conda: bool = False,
433
+ auto_reload: bool = False,
434
+ ) -> None:
286
435
  if self._code_sync_client is None:
287
436
  self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
288
- proc_mesh=self._proc_mesh,
437
+ proc_mesh=await self._proc_mesh_for_asyncio_fixme,
289
438
  )
439
+
290
440
  # TODO(agallagher): We need some way to configure and pass this
291
441
  # in -- right now we're assuming the `gpu` dimension, which isn't
292
442
  # correct.
293
443
  # The workspace shape (i.e. only perform one rsync per host).
294
- assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
444
+ assert set(self._shape.labels).issubset({"gpus", "hosts"})
445
+
446
+ workspaces = []
447
+ for src_dir, dst_dir in workspace.dirs.items():
448
+ workspaces.append(
449
+ WorkspaceConfig(
450
+ local=Path(src_dir),
451
+ remote=RemoteWorkspace(
452
+ location=WorkspaceLocation.FromEnvVar(
453
+ env="WORKSPACE_DIR",
454
+ relpath=dst_dir,
455
+ ),
456
+ shape=WorkspaceShape.shared("gpus"),
457
+ ),
458
+ method=CodeSyncMethod.Rsync,
459
+ ),
460
+ )
461
+
462
+ # If `conda` is set, also sync the currently activated conda env.
463
+ conda_prefix = conda_utils.active_env_dir()
464
+ if isinstance(workspace.env, CondaEnvironment):
465
+ conda_prefix = workspace.env._conda_prefix
466
+
467
+ if conda and conda_prefix is not None:
468
+ conda_prefix = Path(conda_prefix)
469
+
470
+ # Resolve top-level symlinks for rsync/conda-sync.
471
+ while conda_prefix.is_symlink():
472
+ conda_prefix = conda_prefix.parent / conda_prefix.readlink()
473
+
474
+ workspaces.append(
475
+ WorkspaceConfig(
476
+ local=conda_prefix,
477
+ remote=RemoteWorkspace(
478
+ location=WorkspaceLocation.FromEnvVar(
479
+ env="CONDA_PREFIX",
480
+ relpath="",
481
+ ),
482
+ shape=WorkspaceShape.shared("gpus"),
483
+ ),
484
+ method=CodeSyncMethod.CondaSync,
485
+ ),
486
+ )
487
+
295
488
  assert self._code_sync_client is not None
296
- await self._code_sync_client.sync_workspace(
297
- # TODO(agallagher): Is there a better way to infer/set the local
298
- # workspace dir, rather than use PWD?
299
- local=os.getcwd(),
300
- remote=RemoteWorkspace(
301
- location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
302
- shape=WorkspaceShape.shared("gpus"),
303
- ),
489
+ await self._code_sync_client.sync_workspaces(
490
+ workspaces=workspaces,
304
491
  auto_reload=auto_reload,
305
492
  )
306
493
 
307
494
  async def logging_option(
308
495
  self,
309
- stream_to_client: bool = False,
310
- aggregate_window_sec: int | None = None,
496
+ stream_to_client: bool = True,
497
+ aggregate_window_sec: int | None = 3,
498
+ level: int = logging.INFO,
311
499
  ) -> None:
312
500
  """
313
501
  Set the logging options for the remote processes
314
502
 
315
503
  Args:
316
504
  stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
317
- Defaults to False.
505
+ Defaults to True.
318
506
  aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
319
- and sent to the client every aggregate_window_sec seconds. Defaults to None, meaning no aggregation.
320
- aggregate_window_sec will be ignored if stream_to_client is False.
507
+ and sent to the client every aggregate_window_sec seconds. Defaults to 3 seconds.
508
+ Error will be thrown if aggregate_window_sec is set and stream_to_client is False.
509
+ level (int): The logging level of the logger. Defaults to logging.INFO.
321
510
 
322
511
  Returns:
323
512
  None
324
513
  """
325
- if self._logging_mesh_client is None:
326
- self._logging_mesh_client = await LoggingMeshClient.spawn(
327
- proc_mesh=self._proc_mesh
328
- )
329
- self._logging_mesh_client.set_mode(
330
- stream_to_client, aggregate_window_sec=aggregate_window_sec
514
+ await self.initialized
515
+
516
+ await self._logging_manager.logging_option(
517
+ stream_to_client=stream_to_client,
518
+ aggregate_window_sec=aggregate_window_sec,
519
+ level=level,
331
520
  )
332
521
 
333
522
  async def __aenter__(self) -> "ProcMesh":
@@ -336,8 +525,10 @@ class ProcMesh(MeshTrait):
336
525
  return self
337
526
 
338
527
  def stop(self) -> Future[None]:
528
+ self._logging_manager.stop()
529
+
339
530
  async def _stop_nonblocking() -> None:
340
- await self._proc_mesh.stop_nonblocking()
531
+ await (await self._proc_mesh).stop_nonblocking()
341
532
  self._stopped = True
342
533
 
343
534
  return Future(coro=_stop_nonblocking())
@@ -353,6 +544,8 @@ class ProcMesh(MeshTrait):
353
544
  # Finalizer to check if the proc mesh was closed properly.
354
545
  def __del__(self) -> None:
355
546
  if not self._stopped:
547
+ self._logging_manager.stop()
548
+
356
549
  warnings.warn(
357
550
  f"unstopped ProcMesh {self!r}",
358
551
  ResourceWarning,
@@ -361,17 +554,59 @@ class ProcMesh(MeshTrait):
361
554
  )
362
555
  # Cannot call stop here because it is async.
363
556
 
557
+ def __reduce_ex__(self, protocol: ...) -> Tuple[Any, Tuple[Any, ...]]:
558
+ # Ultra-hack. Remote python actors can get a reference to this proc mesh that
559
+ # doesn't have any real functionality, but if they send a request back to the client
560
+ # where the real proc mesh exists, the client can look it up in the proc mesh registry
561
+ # and do something with it.
562
+ global _proc_mesh_registry
563
+ _proc_mesh_registry[ProcMeshRef(self._proc_mesh_id)] = self
564
+ return (ProcMeshRef._fake_proc_mesh, (self._proc_mesh_id,))
565
+
566
+ @staticmethod
567
+ def _from_ref(proc_mesh_ref: ProcMeshRef) -> "ProcMesh":
568
+ maybe_proc_mesh = _proc_mesh_registry.get(proc_mesh_ref, None)
569
+ if maybe_proc_mesh is None:
570
+ raise RuntimeError(
571
+ f"ProcMesh with id {proc_mesh_ref._proc_mesh_id} does not exist"
572
+ )
573
+ return maybe_proc_mesh
574
+
575
+
576
+ def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
577
+ warnings.warn(
578
+ "Use monarch._src.actor.host_mesh.fake_in_process_host().spawn_procs for testing. For launching an actor in the current process use this_proc().spawn_procs()",
579
+ DeprecationWarning,
580
+ stacklevel=2,
581
+ )
364
582
 
365
- def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
366
- return Future(
367
- coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=LocalAllocator())
583
+ return _proc_mesh_from_allocator(
584
+ allocator=LocalAllocator(),
585
+ gpus=gpus,
586
+ hosts=hosts,
368
587
  )
369
588
 
370
589
 
371
- def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
372
- return Future(
373
- coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=SimAllocator())
590
+ def sim_proc_mesh(
591
+ *,
592
+ gpus: int = 1,
593
+ hosts: int = 1,
594
+ racks: int = 1,
595
+ zones: int = 1,
596
+ dcs: int = 1,
597
+ regions: int = 1,
598
+ ) -> ProcMesh:
599
+ spec: AllocSpec = AllocSpec(
600
+ AllocConstraints(),
601
+ hosts=hosts,
602
+ gpus=gpus,
603
+ racks=racks,
604
+ zones=zones,
605
+ dcs=dcs,
606
+ regions=regions,
374
607
  )
608
+ alloc = SimAllocator().allocate(spec)
609
+ return ProcMesh.from_alloc(alloc, None, True)
375
610
 
376
611
 
377
612
  _BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
@@ -392,25 +627,19 @@ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
392
627
  return cmd, args, env
393
628
 
394
629
 
395
- async def _proc_mesh_from_alloc_coro(
396
- alloc: Alloc,
397
- setup: Callable[[], None] | None,
398
- init_manager_actors: bool,
399
- ) -> ProcMesh:
400
- _hy_proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
401
- proc_mesh = ProcMesh(_hy_proc_mesh)
402
- if init_manager_actors:
403
- await proc_mesh._init_manager_actors(setup)
404
- return proc_mesh
630
+ async def _hy_proc_mesh_from_alloc_coro(
631
+ alloc: "Shared[Alloc] | PythonTask[Alloc]",
632
+ ) -> HyProcMesh:
633
+ return await HyProcMesh.allocate_nonblocking(await alloc)
405
634
 
406
635
 
407
- async def _proc_mesh_coro(
636
+ def _proc_mesh_from_allocator(
408
637
  *,
409
638
  allocator: AllocateMixin,
410
- gpus: Optional[int] = None,
411
- hosts: int = 1,
639
+ gpus: Optional[int],
640
+ hosts: int,
412
641
  setup: Callable[[], None] | None = None,
413
- init_manager_actors: bool = True,
642
+ _attach_controller_controller: bool = True,
414
643
  ) -> ProcMesh:
415
644
  if gpus is None:
416
645
  gpus = _local_device_count()
@@ -418,9 +647,8 @@ async def _proc_mesh_coro(
418
647
  # test_remote_function_all_gather expects that hosts comes before gpus
419
648
  # in the order of the dimensions.
420
649
  spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
421
- alloc = await allocator.allocate_nonblocking(spec)
422
-
423
- return await _proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors)
650
+ alloc = allocator.allocate(spec)
651
+ return ProcMesh.from_alloc(alloc, setup, _attach_controller_controller)
424
652
 
425
653
 
426
654
  def proc_mesh(
@@ -429,53 +657,92 @@ def proc_mesh(
429
657
  hosts: int = 1,
430
658
  env: dict[str, str] | None = None,
431
659
  setup: Callable[[], None] | None = None,
432
- ) -> Future[ProcMesh]:
433
- env = env or {}
660
+ ) -> ProcMesh:
661
+ warnings.warn(
662
+ "use this_host().spawn_procs(per_host = {'hosts': 2, 'gpus': 3}) instead of monarch.actor.proc_mesh(hosts=2, gpus=3)",
663
+ DeprecationWarning,
664
+ stacklevel=2,
665
+ )
434
666
 
667
+ env = env or {}
435
668
  # Todo: Deprecate the env field from the ProcessAllocator
436
669
  # The PAR_MAIN_OVERRIDE needs to be passed as an env
437
670
  # to the proc mesh construction in rust, so can not be moved to the
438
671
  # SetupActor yet
439
672
  cmd, args, bootstrap_env = _get_bootstrap_args()
440
673
  env.update(bootstrap_env)
441
- task = _proc_mesh_coro(
442
- gpus=gpus,
674
+ return _proc_mesh_from_allocator(
675
+ allocator=ProcessAllocator(cmd, args, env),
443
676
  hosts=hosts,
677
+ gpus=gpus,
444
678
  setup=setup,
445
- allocator=ProcessAllocator(cmd, args, env),
446
- init_manager_actors=True,
679
+ _attach_controller_controller=True,
447
680
  )
448
- return Future(coro=task)
449
681
 
450
682
 
451
- _debug_proc_mesh: Optional["ProcMesh"] = None
683
+ _ActorType = TypeVar("_ActorType", bound=Actor)
452
684
 
453
685
 
454
- # Lazy init of the debug proc mesh so that importing monarch.proc_mesh
455
- # doesn't trigger the debug client to spawn, which could cause confusing
456
- # logs. This is defined in proc_mesh.py instead of debugger.py for
457
- # circular import reasons.
458
- async def _get_debug_proc_mesh() -> "ProcMesh":
459
- global _debug_proc_mesh
460
- if _debug_proc_mesh is None:
461
- _debug_proc_mesh = await _proc_mesh_coro(
462
- gpus=1, hosts=1, allocator=LocalAllocator(), init_manager_actors=False
463
- )
464
- return _debug_proc_mesh
465
-
686
+ class _ControllerController(Actor):
687
+ def __init__(self) -> None:
688
+ self._controllers: Dict[str, Actor] = {}
466
689
 
467
- _debug_client_mesh: Optional[DebugClient] = None
690
+ # pyre-ignore
691
+ @endpoint
692
+ def get_or_spawn(
693
+ self, name: str, Class: Type[_ActorType], *args: Any, **kwargs: Any
694
+ ) -> _ActorType:
695
+ if name not in self._controllers:
696
+ proc_mesh = _proc_mesh_from_allocator(
697
+ gpus=1,
698
+ hosts=1,
699
+ allocator=LocalAllocator(),
700
+ )
701
+ self._controllers[name] = proc_mesh.spawn(name, Class, *args, **kwargs)
702
+ return cast(_ActorType, self._controllers[name])
703
+
704
+
705
+ _cc_init = threading.Lock()
706
+ _cc_proc_mesh: Optional["ProcMesh"] = None
707
+ _controller_controller: Optional["_ControllerController"] = None
708
+
709
+
710
+ # Lazy init so that the controller_controller and proc do not produce logs when they aren't used.
711
+ # Checking for the controller (when it does not already exist in the MonarchContext) needs a lock,
712
+ # otherwise two initializing procs will both try to init resulting in duplicates. The critical
713
+ # region is not blocking: it spawns a separate task to do the init, assigns the
714
+ # Shared[_ControllerController] from that task to the global and releases the lock.
715
+ def _get_controller_controller() -> "Tuple[ProcMesh, _ControllerController]":
716
+ global _controller_controller, _cc_proc_mesh
717
+ with _cc_init:
718
+ if _controller_controller is None:
719
+ alloc = LocalAllocator().allocate(AllocSpec(AllocConstraints()))
720
+ _cc_proc_mesh = ProcMesh.from_alloc(
721
+ alloc, _attach_controller_controller=False
722
+ )
723
+ _controller_controller = _cc_proc_mesh.spawn(
724
+ "controller_controller", _ControllerController
725
+ )
726
+ assert _cc_proc_mesh is not None
727
+ return _cc_proc_mesh, _controller_controller
468
728
 
469
729
 
470
- # Lazy init for the same reason as above. This is defined in proc_mesh.py
471
- # instead of debugger.py for circular import reasons.
472
- async def _debug_client() -> DebugClient:
473
- global _debug_client_mesh
474
- if _debug_client_mesh is None:
475
- mesh = await _get_debug_proc_mesh()
476
- _debug_client_mesh = await mesh._spawn_nonblocking("debug_client", DebugClient)
477
- return _debug_client_mesh
730
+ def get_or_spawn_controller(
731
+ name: str, Class: Type["_ActorType"], *args: Any, **kwargs: Any
732
+ ) -> Future["_ActorType"]:
733
+ """
734
+ Creates a singleton actor (controller) indexed by name, or if it already exists, returns the
735
+ existing actor.
478
736
 
737
+ Args:
738
+ name (str): The unique name of the actor, used as a key for retrieval.
739
+ Class (Type): The class of the actor to spawn. Must be a subclass of Actor.
740
+ *args (Any): Positional arguments to pass to the actor constructor.
741
+ **kwargs (Any): Keyword arguments to pass to the actor constructor.
479
742
 
480
- def debug_client() -> DebugClient:
481
- return Future(coro=_debug_client()).get()
743
+ Returns:
744
+ A Future that resolves to a reference to the actor.
745
+ """
746
+ return context().actor_instance._controller_controller.get_or_spawn.call_one(
747
+ name, Class, *args, **kwargs
748
+ )