torchmonarch-nightly 2025.7.1-cp311-cp311-manylinux2014_x86_64.whl → 2025.7.26-cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +878 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +303 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +508 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +59 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +53 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +21 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/gradient/_gradient_generator.so +0 -0
  37. monarch/mesh_controller.py +263 -139
  38. monarch/monarch_controller +0 -0
  39. monarch/opaque_module.py +4 -6
  40. monarch/opaque_object.py +3 -3
  41. monarch/proc_mesh.py +6 -309
  42. monarch/python_local_mesh.py +1 -1
  43. monarch/rust_backend_mesh.py +2 -1
  44. monarch/rust_local_mesh.py +4 -2
  45. monarch/sim_mesh.py +10 -19
  46. monarch/simulator/command_history.py +1 -1
  47. monarch/simulator/interface.py +2 -1
  48. monarch/simulator/mock_controller.py +1 -1
  49. monarch/simulator/simulator.py +1 -1
  50. monarch/tensor_engine/__init__.py +23 -0
  51. monarch/tensor_worker_main.py +3 -1
  52. monarch/tools/cli.py +3 -1
  53. monarch/tools/commands.py +129 -47
  54. monarch/tools/components/hyperactor.py +5 -3
  55. monarch/tools/config/__init__.py +18 -1
  56. monarch/tools/config/defaults.py +2 -2
  57. monarch/tools/mesh_spec.py +59 -1
  58. monarch/tools/utils.py +38 -0
  59. monarch/worker/worker.py +1 -1
  60. monarch/world_mesh.py +2 -1
  61. monarch_supervisor/python_executable.py +6 -3
  62. tests/error_test_binary.py +48 -10
  63. tests/test_actor_error.py +370 -21
  64. tests/test_alloc.py +1 -1
  65. tests/test_allocator.py +369 -17
  66. tests/test_controller.py +2 -0
  67. tests/test_debugger.py +416 -0
  68. tests/test_env_before_cuda.py +161 -0
  69. tests/test_python_actors.py +184 -333
  70. tests/test_rdma.py +198 -0
  71. tests/test_remote_functions.py +40 -12
  72. tests/test_rust_backend.py +7 -5
  73. tests/test_sim_backend.py +1 -4
  74. tests/test_tensor_engine.py +81 -1
  75. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
  76. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
  77. torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
  78. monarch/_monarch/hyperactor/__init__.py +0 -58
  79. monarch/_monarch/worker/debugger.py +0 -117
  80. monarch/_monarch/worker/logging.py +0 -107
  81. monarch/debugger.py +0 -379
  82. monarch/future.py +0 -76
  83. monarch/rdma.py +0 -162
  84. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  85. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  86. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  87. /monarch/{common → _src/actor}/shape.py +0 -0
  88. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  89. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
  90. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
  91. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
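
The moves above fold the actor runtime implementation into a private monarch._src.actor package, with new public entry points in monarch/actor/__init__.py and monarch/tensor_engine/__init__.py. A rough before/after import sketch (illustrative only; the exact re-exports are defined in those new __init__ files, whose contents are not shown in this diff):

    # 2025.7.1 layout
    from monarch.allocator import ProcessAllocator
    from monarch.proc_mesh import proc_mesh

    # 2025.7.26 layout: implementation lives under monarch._src.actor,
    # with a public facade assumed under monarch.actor
    from monarch._src.actor.allocator import ProcessAllocator
    from monarch.actor import proc_mesh

Selected hunks from the new and renamed files follow.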
monarch/_src/actor/proc_mesh.py (new file)
@@ -0,0 +1,508 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ import os
+ import sys
+ import warnings
+ from contextlib import AbstractContextManager
+
+ from typing import (
+     Any,
+     Callable,
+     cast,
+     Dict,
+     List,
+     Optional,
+     Sequence,
+     Type,
+     TYPE_CHECKING,
+     TypeVar,
+ )
+
+ from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
+ from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+     Alloc,
+     AllocConstraints,
+     AllocSpec,
+ )
+ from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
+     ProcMesh as HyProcMesh,
+     ProcMeshMonitor,
+ )
+ from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
+ from monarch._src.actor.actor_mesh import (
+     _Actor,
+     _ActorMeshRefImpl,
+     Actor,
+     ActorMeshRef,
+     fake_sync_state,
+ )
+
+ from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
+ from monarch._src.actor.code_sync import (
+     CodeSyncMeshClient,
+     RemoteWorkspace,
+     WorkspaceLocation,
+     WorkspaceShape,
+ )
+ from monarch._src.actor.debugger import (
+     _DEBUG_MANAGER_ACTOR_NAME,
+     DebugClient,
+     DebugManager,
+ )
+
+ from monarch._src.actor.device_utils import _local_device_count
+
+ from monarch._src.actor.endpoint import endpoint
+ from monarch._src.actor.future import Future
+ from monarch._src.actor.shape import MeshTrait
+
+ HAS_TENSOR_ENGINE = False
+ try:
+     from monarch._rust_bindings.rdma import (  # type: ignore[import]
+         _RdmaBuffer,
+         _RdmaManager,
+     )
+
+     # type: ignore[16]
+     HAS_TENSOR_ENGINE = _RdmaBuffer.rdma_supported()
+ except ImportError:
+     logging.warning("RDMA is not available on this platform")
+
+
+ if TYPE_CHECKING:
+     Tensor = Any
+     DeviceMesh = Any
+
+
+ class SetupActor(Actor):
+     """
+     A helper actor to setup the proc mesh with user defined setup method.
+     Typically used to setup the environment variables.
+     """
+
+     def __init__(self, env: Callable[[], None]) -> None:
+         """
+         Initialize the setup actor with the user defined setup method.
+         """
+         self._setup_method = env
+
+     @endpoint
+     async def setup(self) -> None:
+         """
+         Call the user defined setup method with the monarch context.
+         """
+         self._setup_method()
+
+
+ T = TypeVar("T")
+ try:
+     from __manifest__ import fbmake  # noqa
+
+     IN_PAR = bool(fbmake.get("par_style"))
+ except ImportError:
+     IN_PAR = False
+
+
+ async def _allocate_nonblocking(
+     alloc: Alloc, setup: Callable[[], None] | None = None
+ ) -> "ProcMesh":
+     _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
+     if setup is None:
+         return ProcMesh(_proc_mesh)
+     # If the user has passed the setup lambda, we need to call
+     # it here before any of the other actors are spawned so that
+     # the environment variables are set up before cuda init.
+     proc_mesh = ProcMesh(_proc_mesh)
+     setup_actor = await proc_mesh.spawn("setup", SetupActor, setup)
+     await setup_actor.setup.call()
+     del setup_actor
+     return proc_mesh
+
+
+ class ProcMesh(MeshTrait):
+     def __init__(
+         self,
+         hy_proc_mesh: HyProcMesh,
+         _mock_shape: Optional[Shape] = None,
+         _device_mesh: Optional["DeviceMesh"] = None,
+         _is_initializing_debugger: bool = False,
+     ) -> None:
+         self._proc_mesh = hy_proc_mesh
+         self._mock_shape: Optional[Shape] = _mock_shape
+         # type: ignore[21]
+         self._rdma_manager: Optional["_RdmaManager"] = None
+         self._debug_manager: Optional[DebugManager] = None
+         self._mailbox: Mailbox = self._proc_mesh.client
+         self._code_sync_client: Optional[CodeSyncMeshClient] = None
+         self._logging_mesh_client: Optional[LoggingMeshClient] = None
+         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
+         self._stopped = False
+
+         # This code is unsafe in async contexts, but we currently do it all over the place
+         # we need to refactor this by moving it to the first time we try to spawn on the mesh.
+         # Right now we simply preserve the previous behavior and disable the check that prevents
+         # end users from doing the same.
+         with fake_sync_state():
+             if _mock_shape is None and HAS_TENSOR_ENGINE:
+                 # type: ignore[21]
+                 self._rdma_manager = _RdmaManager.create_rdma_manager_blocking(
+                     self._proc_mesh
+                 )
+             if not _is_initializing_debugger and _mock_shape is None:
+                 self._debug_manager = self.spawn(
+                     _DEBUG_MANAGER_ACTOR_NAME, DebugManager, debug_client()
+                 ).get()
+
+     @property
+     def _shape(self) -> Shape:
+         return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
+
+     @property
+     def _ndslice(self) -> Slice:
+         return self._shape.ndslice
+
+     @property
+     def _labels(self) -> List[str]:
+         return self._shape.labels
+
+     def _new_with_shape(self, shape: Shape) -> "ProcMesh":
+         device_mesh = (
+             None
+             if self._maybe_device_mesh is None
+             else self._device_mesh._new_with_shape(shape)
+         )
+         return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
+
+     def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
+         if self._mock_shape is not None:
+             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
+         return Future(
+             impl=lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
+             requires_loop=False,
+         )
+
+     async def monitor(self) -> ProcMeshMonitor:
+         """
+         Get a monitor (async iterator) of the proc mesh, it is used to
+         monitor the status of the proc mesh. This function can be called at most once.
+
+         Note: This API is experimental and subject to change.
+
+         Example:
+
+             async def monitor_loop(monitor):
+                 async for event in monitor:
+                     await handle_exception_event(event)
+
+             # Kick off in background
+             asyncio.create_task(monitor_loop(monitor))
+         """
+         return await self._proc_mesh.monitor()
+
+     @classmethod
+     def from_alloc(
+         self, alloc: Alloc, setup: Callable[[], None] | None = None
+     ) -> Future["ProcMesh"]:
+         """
+         Allocate a process mesh according to the provided alloc.
+         Returns when the mesh is fully allocated.
+
+         Arguments:
+         - `alloc`: The alloc to allocate according to.
+         - `setup`: An optional lambda function to configure environment variables on the allocated mesh.
+             Use the `current_rank()` method within the lambda to obtain the rank.
+
+         Example of a setup method to initialize torch distributed environment variables:
+         ```
+         def setup():
+             rank = current_rank()
+             os.environ["RANK"] = str(rank)
+             os.environ["WORLD_SIZE"] = str(len(rank.shape))
+             os.environ["LOCAL_RANK"] = str(rank["gpus"])
+         ```
+         """
+         return Future(
+             impl=lambda: _allocate_nonblocking(alloc, setup),
+             requires_loop=False,
+         )
+
+     def __repr__(self) -> str:
+         return repr(self._proc_mesh)
+
+     def __str__(self) -> str:
+         return str(self._proc_mesh)
+
+     async def _spawn_nonblocking(
+         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+     ) -> T:
+         if not issubclass(Class, Actor):
+             raise ValueError(
+                 f"{Class} must subclass monarch.service.Actor to spawn it."
+             )
+         actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
+         service = ActorMeshRef(
+             Class,
+             _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh, self),
+             self._mailbox,
+         )
+         # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+         # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+         service._create(args, kwargs)
+         return cast(T, service)
+
+     @property
+     def _device_mesh(self) -> "DeviceMesh":
+         if not HAS_TENSOR_ENGINE:
+             raise RuntimeError(
+                 "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+             )
+
+         # type: ignore[21]
+         from monarch.mesh_controller import spawn_tensor_engine  # @manual
+
+         if self._maybe_device_mesh is None:
+             if self._mock_shape is not None:
+                 raise NotImplementedError(
+                     "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
+                 )
+             # type: ignore[21]
+             self._maybe_device_mesh = spawn_tensor_engine(self)
+         return self._maybe_device_mesh
+
+     # pyre-ignore
+     def activate(self) -> AbstractContextManager:
+         return self._device_mesh.activate()
+
+     def rank_tensor(self, dim: str | Sequence[str]) -> "Tensor":
+         return self._device_mesh.rank(dim)
+
+     def rank_tensors(self) -> Dict[str, "Tensor"]:
+         return self._device_mesh.ranks
+
+     async def sync_workspace(self, auto_reload: bool = False) -> None:
+         if self._code_sync_client is None:
+             self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
+                 proc_mesh=self._proc_mesh,
+             )
+         # TODO(agallagher): We need some way to configure and pass this
+         # in -- right now we're assuming the `gpu` dimension, which isn't
+         # correct.
+         # The workspace shape (i.e. only perform one rsync per host).
+         assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+         assert self._code_sync_client is not None
+         await self._code_sync_client.sync_workspace(
+             # TODO(agallagher): Is there a better way to infer/set the local
+             # workspace dir, rather than use PWD?
+             local=os.getcwd(),
+             remote=RemoteWorkspace(
+                 location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
+                 shape=WorkspaceShape.shared("gpus"),
+             ),
+             auto_reload=auto_reload,
+         )
+
+     async def logging_option(
+         self,
+         stream_to_client: bool = False,
+         aggregate_window_sec: int | None = None,
+     ) -> None:
+         """
+         Set the logging options for the remote processes
+
+         Args:
+             stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
+                 Defaults to False.
+             aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
+                 and sent to the client every aggregate_window_sec seconds. Defaults to None, meaning no aggregation.
+                 aggregate_window_sec will be ignored if stream_to_client is False.
+
+         Returns:
+             None
+         """
+         if self._logging_mesh_client is None:
+             self._logging_mesh_client = await LoggingMeshClient.spawn(
+                 proc_mesh=self._proc_mesh
+             )
+         self._logging_mesh_client.set_mode(
+             stream_to_client, aggregate_window_sec=aggregate_window_sec
+         )
+
+     async def __aenter__(self) -> "ProcMesh":
+         if self._stopped:
+             raise RuntimeError("`ProcMesh` has already been stopped")
+         return self
+
+     def stop(self) -> Future[None]:
+         async def _stop_nonblocking() -> None:
+             await self._proc_mesh.stop_nonblocking()
+             self._stopped = True
+
+         return Future(
+             impl=lambda: _stop_nonblocking(),
+             requires_loop=False,
+         )
+
+     async def __aexit__(
+         self, exc_type: object, exc_val: object, exc_tb: object
+     ) -> None:
+         # In case there are multiple nested "async with" statements, we only
+         # want it to close once.
+         if not self._stopped:
+             await self.stop()
+
+     # Finalizer to check if the proc mesh was closed properly.
+     def __del__(self) -> None:
+         if not self._stopped:
+             warnings.warn(
+                 f"unstopped ProcMesh {self!r}",
+                 ResourceWarning,
+                 stacklevel=2,
+                 source=self,
+             )
+             # Cannot call stop here because it is async.
+
+
+ async def local_proc_mesh_nonblocking(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     _is_initializing_debugger: bool = False,
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = LocalAllocator()
+     alloc = await allocator.allocate(spec)
+     proc_mesh = HyProcMesh.allocate_nonblocking(alloc)
+     return ProcMesh(
+         await proc_mesh,
+         _is_initializing_debugger=_is_initializing_debugger,
+     )
+
+
+ def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+         requires_loop=False,
+     )
+
+
+ async def sim_proc_mesh_nonblocking(
+     *, gpus: Optional[int] = None, hosts: int = 1
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = SimAllocator()
+     alloc = await allocator.allocate(spec)
+     return await ProcMesh.from_alloc(alloc)
+
+
+ def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: sim_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+         requires_loop=False,
+     )
+
+
+ _BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
+
+
+ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
+     if IN_PAR:
+         cmd = sys.argv[0]
+         args = None
+         env = {
+             "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
+         }
+     else:
+         cmd = sys.executable
+         args = ["-m", _BOOTSTRAP_MAIN]
+         env = {}
+
+     return cmd, args, env
+
+
+ async def proc_mesh_nonblocking(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     env: dict[str, str] | None = None,
+     setup: Callable[[], None] | None = None,
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     # gpus must come last in this order because
+     # test_remote_function_all_gather expects that hosts comes before gpus
+     # in the order of the dimensions.
+     spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+     env = env or {}
+     # Todo: Deprecate the env field from the ProcessAllocator
+     # The PAR_MAIN_OVERRIDE needs to be passed as an env
+     # to the proc mesh construction in rust, so can not be moved to the
+     # SetupActor yet
+     cmd, args, bootstrap_env = _get_bootstrap_args()
+     env.update(bootstrap_env)
+     allocator = ProcessAllocator(cmd, args, env)
+     alloc = await allocator.allocate(spec)
+
+     return await ProcMesh.from_alloc(
+         alloc,
+         setup=setup,
+     )
+
+
+ def proc_mesh(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     env: dict[str, str] | None = None,
+     setup: Callable[[], None] | None = None,
+ ) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: proc_mesh_nonblocking(
+             gpus=gpus, hosts=hosts, env=env, setup=setup
+         ),
+         requires_loop=False,
+     )
+
+
+ _debug_proc_mesh: Optional["ProcMesh"] = None
+
+
+ # Lazy init of the debug proc mesh so that importing monarch.proc_mesh
+ # doesn't trigger the debug client to spawn, which could cause confusing
+ # logs. This is defined in proc_mesh.py instead of debugger.py for
+ # circular import reasons.
+ def _get_debug_proc_mesh() -> "ProcMesh":
+     global _debug_proc_mesh
+     if _debug_proc_mesh is None:
+         _debug_proc_mesh = Future(
+             impl=lambda: local_proc_mesh_nonblocking(
+                 gpus=1, hosts=1, _is_initializing_debugger=True
+             ),
+             requires_loop=False,
+         ).get()
+     return _debug_proc_mesh
+
+
+ _debug_client_mesh: Optional[DebugClient] = None
+
+
+ # Lazy init for the same reason as above. This is defined in proc_mesh.py
+ # instead of debugger.py for circular import reasons.
+ def debug_client() -> DebugClient:
+     global _debug_client_mesh
+     if _debug_client_mesh is None:
+         _debug_client_mesh = (
+             _get_debug_proc_mesh().spawn("debug_client", DebugClient).get()
+         )
+     return _debug_client_mesh
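
A minimal usage sketch for the module above (not part of the diff; it assumes monarch.actor re-exports Actor, endpoint, and proc_mesh, and relies on the Future.get() / endpoint .call() pattern that SetupActor and ProcMesh.spawn use above):

    from monarch.actor import Actor, endpoint, proc_mesh

    class Counter(Actor):
        def __init__(self, start: int) -> None:
            self._value = start

        @endpoint
        async def value(self) -> int:
            return self._value

    mesh = proc_mesh(gpus=2).get()                        # Future[ProcMesh] -> ProcMesh
    counters = mesh.spawn("counters", Counter, 0).get()   # Future[Counter] -> actor mesh ref
    print(counters.value.call().get())                    # invoke the endpoint on every rank
    mesh.stop().get()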
monarch/_src/actor/sync_state.py (new file)
@@ -0,0 +1,18 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import asyncio
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def fake_sync_state():
+     prev_loop = asyncio.events._get_running_loop()
+     asyncio._set_running_loop(None)
+     try:
+         yield
+     finally:
+         asyncio._set_running_loop(prev_loop)
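
An illustrative check of the helper above (the assertions are mine, not from the package): fake_sync_state() temporarily hides the running event loop so that blocking .get()-style calls issued from async code, as ProcMesh.__init__ does, are not rejected by running-loop checks.

    import asyncio

    from monarch._src.actor.sync_state import fake_sync_state

    async def main() -> None:
        assert asyncio.events._get_running_loop() is not None
        with fake_sync_state():
            # Inside the block no loop appears to be running.
            assert asyncio.events._get_running_loop() is None
        assert asyncio.events._get_running_loop() is not None

    asyncio.run(main())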
monarch/{telemetry.py → _src/actor/telemetry/__init__.py}
@@ -9,7 +9,7 @@
 
  import logging
 
- from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
+ from monarch._rust_bindings.monarch_hyperactor.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
      forward_to_tracing,
  )
 
monarch/_src/actor/telemetry/rust_span_tracing.py (new file)
@@ -0,0 +1,159 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ from contextlib import contextmanager
+ from typing import Iterator, Mapping, Optional, Union
+
+ import opentelemetry.util.types as types  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+
+ from monarch._rust_bindings.monarch_hyperactor.telemetry import (
+     get_current_span_id,
+     PySpan,
+ )
+
+ from opentelemetry import (  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+     trace,
+ )
+ from opentelemetry.trace import Tracer
+ from opentelemetry.trace.status import Status, StatusCode
+ from pyre_extensions import override
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ class SpanWrapper(trace.Span):
+     def __init__(self, name: str) -> None:
+         super().__init__()
+         self._span: PySpan | None = PySpan(name)
+
+     @override
+     def end(self, end_time: Optional[int] = None) -> None:
+         # since PySpan is not sendable, we need to make sure it is deallocated on this thread so it doesn't log warnings.
+         s = self._span
+         assert s is not None
+         s.exit()
+         self._span = None
+         del s
+
+     def record_exception(
+         self,
+         exception: BaseException,
+         attributes: types.Attributes = None,
+         timestamp: Optional[int] = None,
+         escaped: bool = False,
+     ) -> None:
+         pass
+
+     def is_recording(self) -> bool:
+         return False
+
+     def get_span_context(self) -> trace.span.SpanContext:
+         span_id = get_current_span_id()
+         return trace.span.SpanContext(trace_id=0, span_id=span_id, is_remote=False)
+
+     def set_attributes(self, attributes: Mapping[str, types.AttributeValue]) -> None:
+         pass
+
+     def set_attribute(self, key: str, value: types.AttributeValue) -> None:
+         pass
+
+     def add_event(
+         self,
+         name: str,
+         attributes: types.Attributes = None,
+         timestamp: Optional[int] = None,
+     ) -> None:
+         pass
+
+     def update_name(self, name: str) -> None:
+         pass
+
+     def set_status(
+         self,
+         status: Union[Status, StatusCode],
+         description: Optional[str] = None,
+     ) -> None:
+         pass
+
+
+ class RustTracer(trace.Tracer):
+     def start_span(
+         self,
+         name: str,
+         context: Optional[trace.Context] = None,
+         kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+         attributes: types.Attributes = None,
+         links: trace._Links = None,
+         start_time: Optional[int] = None,
+         record_exception: bool = True,
+         set_status_on_exception: bool = True,
+     ) -> trace.Span:
+         return SpanWrapper(name)
+
+     @contextmanager
+     # pyre-fixme[15]: `start_as_current_span` overrides method defined in `Tracer`
+     #  inconsistently.
+     def start_as_current_span(
+         self,
+         name: str,
+         context: Optional[trace.Context] = None,
+         kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+         attributes: types.Attributes = None,
+         links: trace._Links = None,
+         start_time: Optional[int] = None,
+         record_exception: bool = True,
+         set_status_on_exception: bool = True,
+         end_on_exit: bool = True,
+     ) -> Iterator[trace.Span]:
+         with SpanWrapper(name) as s:
+             with trace.use_span(s):
+                 yield s
+                 del s
+
+
+ class RustTracerProvider(trace.TracerProvider):
+     @override
+     def get_tracer(
+         self,
+         instrumenting_module_name: str,
+         *args: object,
+         instrumenting_library_version: Optional[str] = None,
+         schema_url: Optional[str] = None,
+         **kwargs: object,
+     ) -> trace.Tracer:
+         return RustTracer()
+
+
+ def get_monarch_tracer() -> Tracer:
+     """
+     Creates and returns a Monarch python tracer that logs to the Rust telemetry system.
+
+     Returns:
+         Tracer: A configured OpenTelemetry tracer for Monarch.
+
+     Usage:
+         tracer = get_monarch_tracer()
+         with tracer.start_as_current_span("span_name") as span:
+             # code here
+     """
+     install()
+     return trace.get_tracer("monarch.python.tracer")
+
+
+ _INSTALLED = False
+
+
+ def install() -> None:
+     global _INSTALLED
+     if _INSTALLED:
+         return
+
+     provider = RustTracerProvider()
+     trace.set_tracer_provider(provider)
+     _INSTALLED = True
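
A short sketch of the intended use, based on the get_monarch_tracer docstring above (span names are arbitrary): install() swaps the global OpenTelemetry TracerProvider for RustTracerProvider, so spans created through the standard opentelemetry API are forwarded to the Rust telemetry backend as PySpan objects.

    from monarch._src.actor.telemetry.rust_span_tracing import get_monarch_tracer

    tracer = get_monarch_tracer()  # installs RustTracerProvider on first use

    with tracer.start_as_current_span("load_checkpoint"):
        with tracer.start_as_current_span("read_shard"):
            pass  # traced work goes here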