torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/_src/actor/proc_mesh.py (new file)
@@ -0,0 +1,500 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ import os
+ import sys
+ import warnings
+ from contextlib import AbstractContextManager
+
+ from typing import (
+     Any,
+     Callable,
+     cast,
+     Dict,
+     List,
+     Optional,
+     Sequence,
+     Type,
+     TYPE_CHECKING,
+     TypeVar,
+ )
+
+ from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
+ from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+     Alloc,
+     AllocConstraints,
+     AllocSpec,
+ )
+ from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
+     ProcMesh as HyProcMesh,
+     ProcMeshMonitor,
+ )
+ from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
+ from monarch._src.actor.actor_mesh import (
+     _Actor,
+     _ActorMeshRefImpl,
+     Actor,
+     ActorMeshRef,
+     fake_sync_state,
+     MonarchContext,
+ )
+
+ from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
+ from monarch._src.actor.code_sync import (
+     CodeSyncMeshClient,
+     RemoteWorkspace,
+     WorkspaceLocation,
+     WorkspaceShape,
+ )
+ from monarch._src.actor.debugger import (
+     _DEBUG_MANAGER_ACTOR_NAME,
+     DebugClient,
+     DebugManager,
+ )
+
+ from monarch._src.actor.device_utils import _local_device_count
+
+ from monarch._src.actor.endpoint import endpoint
+ from monarch._src.actor.future import Future
+ from monarch._src.actor.shape import MeshTrait
+
+ HAS_TENSOR_ENGINE = False
+ try:
+     from monarch._rust_bindings.rdma import (  # type: ignore[import]
+         _RdmaBuffer,
+         _RdmaManager,
+     )
+
+     # type: ignore[16]
+     HAS_TENSOR_ENGINE = _RdmaBuffer.rdma_supported()
+ except ImportError:
+     logging.warning("RDMA is not available on this platform")
+
+
+ if TYPE_CHECKING:
+     Tensor = Any
+     DeviceMesh = Any
+
+
+ class SetupActor(Actor):
+     """
+     A helper actor to setup the proc mesh with user defined setup method.
+     Typically used to setup the environment variables.
+     """
+
+     def __init__(self, env: Callable[[MonarchContext], None]) -> None:
+         """
+         Initialize the setup actor with the user defined setup method.
+         """
+         self._setup_method = env
+
+     @endpoint
+     async def setup(self) -> None:
+         """
+         Call the user defined setup method with the monarch context.
+         """
+         ctx = MonarchContext.get()
+         self._setup_method(ctx)
+
+
+ T = TypeVar("T")
+ try:
+     from __manifest__ import fbmake  # noqa
+
+     IN_PAR = bool(fbmake.get("par_style"))
+ except ImportError:
+     IN_PAR = False
+
+
+ async def _allocate_nonblocking(
+     alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+ ) -> "ProcMesh":
+     _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
+     if setup is None:
+         return ProcMesh(_proc_mesh)
+     # If the user has passed the setup lambda, we need to call
+     # it here before any of the other actors are spawned so that
+     # the environment variables are set up before cuda init.
+     proc_mesh = ProcMesh(_proc_mesh)
+     setup_actor = await proc_mesh.spawn("setup", SetupActor, setup)
+     await setup_actor.setup.call()
+     del setup_actor
+     return proc_mesh
+
+
+ class ProcMesh(MeshTrait):
+     def __init__(
+         self,
+         hy_proc_mesh: HyProcMesh,
+         _mock_shape: Optional[Shape] = None,
+         _device_mesh: Optional["DeviceMesh"] = None,
+         _is_initializing_debugger: bool = False,
+     ) -> None:
+         self._proc_mesh = hy_proc_mesh
+         self._mock_shape: Optional[Shape] = _mock_shape
+         # type: ignore[21]
+         self._rdma_manager: Optional["_RdmaManager"] = None
+         self._debug_manager: Optional[DebugManager] = None
+         self._mailbox: Mailbox = self._proc_mesh.client
+         self._code_sync_client: Optional[CodeSyncMeshClient] = None
+         self._logging_mesh_client: Optional[LoggingMeshClient] = None
+         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
+         self._stopped = False
+
+         # This code is unsafe in async contexts, but we currently do it all over the place.
+         # We need to refactor this by moving it to the first time we try to spawn on the mesh.
+         # Right now we simply preserve the previous behavior and disable the check that prevents
+         # end users from doing the same.
+         with fake_sync_state():
+             if _mock_shape is None and HAS_TENSOR_ENGINE:
+                 # type: ignore[21]
+                 self._rdma_manager = _RdmaManager.create_rdma_manager_blocking(
+                     self._proc_mesh
+                 )
+             if not _is_initializing_debugger and _mock_shape is None:
+                 self._debug_manager = self.spawn(
+                     _DEBUG_MANAGER_ACTOR_NAME, DebugManager, debug_client()
+                 ).get()
+
+     @property
+     def _shape(self) -> Shape:
+         return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
+
+     @property
+     def _ndslice(self) -> Slice:
+         return self._shape.ndslice
+
+     @property
+     def _labels(self) -> List[str]:
+         return self._shape.labels
+
+     def _new_with_shape(self, shape: Shape) -> "ProcMesh":
+         device_mesh = (
+             None
+             if self._maybe_device_mesh is None
+             else self._device_mesh._new_with_shape(shape)
+         )
+         return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
+
+     def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
+         if self._mock_shape is not None:
+             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
+         return Future(
+             impl=lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
+             requires_loop=False,
+         )
+
+     async def monitor(self) -> ProcMeshMonitor:
+         """
+         Get a monitor (async iterator) of the proc mesh; it is used to
+         monitor the status of the proc mesh. This function can be called at most once.
+
+         Note: This API is experimental and subject to change.
+
+         Example:
+
+         async def monitor_loop(monitor):
+             async for event in monitor:
+                 await handle_exception_event(event)
+
+         # Kick off in background
+         asyncio.create_task(monitor_loop(monitor))
+         """
+         return await self._proc_mesh.monitor()
+
+     @classmethod
+     def from_alloc(
+         self, alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+     ) -> Future["ProcMesh"]:
+         """
+         Allocate a process mesh according to the provided alloc.
+         Returns when the mesh is fully allocated.
+
+         Arguments:
+         - `alloc`: The alloc to allocate according to.
+         - `setup`: A lambda taking MonarchContext as param; can be used to set up env vars on the allocated mesh.
+         """
+         return Future(
+             impl=lambda: _allocate_nonblocking(alloc, setup),
+             requires_loop=False,
+         )
+
+     def __repr__(self) -> str:
+         return repr(self._proc_mesh)
+
+     def __str__(self) -> str:
+         return str(self._proc_mesh)
+
+     async def _spawn_nonblocking(
+         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+     ) -> T:
+         if not issubclass(Class, Actor):
+             raise ValueError(
+                 f"{Class} must subclass monarch.service.Actor to spawn it."
+             )
+         actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
+         service = ActorMeshRef(
+             Class,
+             _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh, self),
+             self._mailbox,
+         )
+         # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+         # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+         service._create(args, kwargs)
+         return cast(T, service)
+
+     @property
+     def _device_mesh(self) -> "DeviceMesh":
+         if not HAS_TENSOR_ENGINE:
+             raise RuntimeError(
+                 "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+             )
+
+         # type: ignore[21]
+         from monarch.mesh_controller import spawn_tensor_engine  # @manual
+
+         if self._maybe_device_mesh is None:
+             if self._mock_shape is not None:
+                 raise NotImplementedError(
+                     "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
+                 )
+             # type: ignore[21]
+             self._maybe_device_mesh = spawn_tensor_engine(self)
+         return self._maybe_device_mesh
+
+     # pyre-ignore
+     def activate(self) -> AbstractContextManager:
+         return self._device_mesh.activate()
+
+     def rank_tensor(self, dim: str | Sequence[str]) -> "Tensor":
+         return self._device_mesh.rank(dim)
+
+     def rank_tensors(self) -> Dict[str, "Tensor"]:
+         return self._device_mesh.ranks
+
+     async def sync_workspace(self, auto_reload: bool = False) -> None:
+         if self._code_sync_client is None:
+             self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
+                 proc_mesh=self._proc_mesh,
+             )
+         # TODO(agallagher): We need some way to configure and pass this
+         # in -- right now we're assuming the `gpu` dimension, which isn't
+         # correct.
+         # The workspace shape (i.e. only perform one rsync per host).
+         assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+         assert self._code_sync_client is not None
+         await self._code_sync_client.sync_workspace(
+             # TODO(agallagher): Is there a better way to infer/set the local
+             # workspace dir, rather than use PWD?
+             local=os.getcwd(),
+             remote=RemoteWorkspace(
+                 location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
+                 shape=WorkspaceShape.shared("gpus"),
+             ),
+             auto_reload=auto_reload,
+         )
+
+     async def logging_option(
+         self,
+         stream_to_client: bool = False,
+         aggregate_window_sec: int | None = None,
+     ) -> None:
+         """
+         Set the logging options for the remote processes.
+
+         Args:
+             stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
+                 Defaults to False.
+             aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
+                 and sent to the client every aggregate_window_sec seconds. Defaults to None, meaning no aggregation.
+                 aggregate_window_sec will be ignored if stream_to_client is False.
+
+         Returns:
+             None
+         """
+         if self._logging_mesh_client is None:
+             self._logging_mesh_client = await LoggingMeshClient.spawn(
+                 proc_mesh=self._proc_mesh
+             )
+         self._logging_mesh_client.set_mode(
+             stream_to_client, aggregate_window_sec=aggregate_window_sec
+         )
+
+     async def __aenter__(self) -> "ProcMesh":
+         if self._stopped:
+             raise RuntimeError("`ProcMesh` has already been stopped")
+         return self
+
+     def stop(self) -> Future[None]:
+         async def _stop_nonblocking() -> None:
+             await self._proc_mesh.stop_nonblocking()
+             self._stopped = True
+
+         return Future(
+             impl=lambda: _stop_nonblocking(),
+             requires_loop=False,
+         )
+
+     async def __aexit__(
+         self, exc_type: object, exc_val: object, exc_tb: object
+     ) -> None:
+         # In case there are multiple nested "async with" statements, we only
+         # want it to close once.
+         if not self._stopped:
+             await self.stop()
+
+     # Finalizer to check if the proc mesh was closed properly.
+     def __del__(self) -> None:
+         if not self._stopped:
+             warnings.warn(
+                 f"unstopped ProcMesh {self!r}",
+                 ResourceWarning,
+                 stacklevel=2,
+                 source=self,
+             )
+             # Cannot call stop here because it is async.
+
+
+ async def local_proc_mesh_nonblocking(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     _is_initializing_debugger: bool = False,
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = LocalAllocator()
+     alloc = await allocator.allocate(spec)
+     proc_mesh = HyProcMesh.allocate_nonblocking(alloc)
+     return ProcMesh(
+         await proc_mesh,
+         _is_initializing_debugger=_is_initializing_debugger,
+     )
+
+
+ def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+         requires_loop=False,
+     )
+
+
+ async def sim_proc_mesh_nonblocking(
+     *, gpus: Optional[int] = None, hosts: int = 1
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = SimAllocator()
+     alloc = await allocator.allocate(spec)
+     return await ProcMesh.from_alloc(alloc)
+
+
+ def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: sim_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+         requires_loop=False,
+     )
+
+
+ _BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
+
+
+ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
+     if IN_PAR:
+         cmd = sys.argv[0]
+         args = None
+         env = {
+             "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
+         }
+     else:
+         cmd = sys.executable
+         args = ["-m", _BOOTSTRAP_MAIN]
+         env = {}
+
+     return cmd, args, env
+
+
+ async def proc_mesh_nonblocking(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     env: dict[str, str] | None = None,
+     setup: Callable[[MonarchContext], None] | None = None,
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     # gpus must come last in this order because
+     # test_remote_function_all_gather expects that hosts comes before gpus
+     # in the order of the dimensions.
+     spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+     env = env or {}
+     # Todo: Deprecate the env field from the ProcessAllocator.
+     # The PAR_MAIN_OVERRIDE needs to be passed as an env
+     # to the proc mesh construction in rust, so it can not be moved to the
+     # SetupActor yet.
+     cmd, args, bootstrap_env = _get_bootstrap_args()
+     env.update(bootstrap_env)
+     allocator = ProcessAllocator(cmd, args, env)
+     alloc = await allocator.allocate(spec)
+
+     return await ProcMesh.from_alloc(
+         alloc,
+         setup=setup,
+     )
+
+
+ def proc_mesh(
+     *,
+     gpus: Optional[int] = None,
+     hosts: int = 1,
+     env: dict[str, str] | None = None,
+     setup: Callable[[MonarchContext], None] | None = None,
+ ) -> Future[ProcMesh]:
+     return Future(
+         impl=lambda: proc_mesh_nonblocking(
+             gpus=gpus, hosts=hosts, env=env, setup=setup
+         ),
+         requires_loop=False,
+     )
+
+
+ _debug_proc_mesh: Optional["ProcMesh"] = None
+
+
+ # Lazy init of the debug proc mesh so that importing monarch.proc_mesh
+ # doesn't trigger the debug client to spawn, which could cause confusing
+ # logs. This is defined in proc_mesh.py instead of debugger.py for
+ # circular import reasons.
+ def _get_debug_proc_mesh() -> "ProcMesh":
+     global _debug_proc_mesh
+     if _debug_proc_mesh is None:
+         _debug_proc_mesh = Future(
+             impl=lambda: local_proc_mesh_nonblocking(
+                 gpus=1, hosts=1, _is_initializing_debugger=True
+             ),
+             requires_loop=False,
+         ).get()
+     return _debug_proc_mesh
+
+
+ _debug_client_mesh: Optional[DebugClient] = None
+
+
+ # Lazy init for the same reason as above. This is defined in proc_mesh.py
+ # instead of debugger.py for circular import reasons.
+ def debug_client() -> DebugClient:
+     global _debug_client_mesh
+     if _debug_client_mesh is None:
+         _debug_client_mesh = (
+             _get_debug_proc_mesh().spawn("debug_client", DebugClient).get()
+         )
+     return _debug_client_mesh
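
For orientation, here is a minimal sketch of how the new `proc_mesh` entry point and its `setup` hook added above fit together. It is not part of the diff: `Greeter`, `set_my_env`, and `MY_FLAG` are hypothetical names, and it assumes the internal `monarch._src.actor` modules shown in this file are importable directly (the public re-exports in `monarch/actor/__init__.py` may be the preferred surface).

    import os

    from monarch._src.actor.actor_mesh import Actor, MonarchContext
    from monarch._src.actor.endpoint import endpoint
    from monarch._src.actor.proc_mesh import proc_mesh


    class Greeter(Actor):  # hypothetical example actor
        @endpoint
        async def greet(self) -> str:
            return "hello"


    def set_my_env(ctx: MonarchContext) -> None:
        # Runs on each proc before any other actor is spawned, i.e. before
        # CUDA init (see the comment in _allocate_nonblocking above).
        os.environ["MY_FLAG"] = "1"  # hypothetical env var


    # proc_mesh() returns a Future[ProcMesh]; .get() blocks until allocation
    # completes, as does .get() on the Futures returned by spawn() and stop().
    mesh = proc_mesh(gpus=1, hosts=1, setup=set_my_env).get()
    greeter = mesh.spawn("greeter", Greeter).get()
    print(greeter.greet.call().get())  # assumes endpoint .call() yields a Future
    mesh.stop().get()
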
monarch/_src/actor/sync_state.py (new file)
@@ -0,0 +1,18 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import asyncio
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def fake_sync_state():
+     prev_loop = asyncio.events._get_running_loop()
+     asyncio._set_running_loop(None)
+     try:
+         yield
+     finally:
+         asyncio._set_running_loop(prev_loop)
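
A quick illustration of what `fake_sync_state` does (not part of the diff): it temporarily hides the running event loop from asyncio's bookkeeping, which is how `ProcMesh.__init__` above can call blocking `.get()` while technically inside an async context.

    import asyncio

    from monarch._src.actor.sync_state import fake_sync_state


    async def main() -> None:
        # Inside a coroutine, asyncio tracks a running loop...
        assert asyncio.events._get_running_loop() is not None
        with fake_sync_state():
            # ...but under fake_sync_state the loop is masked, so sync entry
            # points that refuse to run inside a loop will proceed.
            assert asyncio.events._get_running_loop() is None
        # The previous loop is restored on exit.
        assert asyncio.events._get_running_loop() is not None


    asyncio.run(main())
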
monarch/{telemetry.py → _src/actor/telemetry/__init__.py}
@@ -9,7 +9,7 @@

   import logging

- from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
+ from monarch._rust_bindings.monarch_hyperactor.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
       forward_to_tracing,
   )

monarch/_src/actor/telemetry/rust_span_tracing.py (new file)
@@ -0,0 +1,159 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ from contextlib import contextmanager
+ from typing import Iterator, Mapping, Optional, Union
+
+ import opentelemetry.util.types as types  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+
+ from monarch._rust_bindings.monarch_hyperactor.telemetry import (
+     get_current_span_id,
+     PySpan,
+ )
+
+ from opentelemetry import (  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+     trace,
+ )
+ from opentelemetry.trace import Tracer
+ from opentelemetry.trace.status import Status, StatusCode
+ from pyre_extensions import override
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ class SpanWrapper(trace.Span):
+     def __init__(self, name: str) -> None:
+         super().__init__()
+         self._span: PySpan | None = PySpan(name)
+
+     @override
+     def end(self, end_time: Optional[int] = None) -> None:
+         # since PySpan is not sendable, we need to make sure it is deallocated on this thread so it doesn't log warnings.
+         s = self._span
+         assert s is not None
+         s.exit()
+         self._span = None
+         del s
+
+     def record_exception(
+         self,
+         exception: BaseException,
+         attributes: types.Attributes = None,
+         timestamp: Optional[int] = None,
+         escaped: bool = False,
+     ) -> None:
+         pass
+
+     def is_recording(self) -> bool:
+         return False
+
+     def get_span_context(self) -> trace.span.SpanContext:
+         span_id = get_current_span_id()
+         return trace.span.SpanContext(trace_id=0, span_id=span_id, is_remote=False)
+
+     def set_attributes(self, attributes: Mapping[str, types.AttributeValue]) -> None:
+         pass
+
+     def set_attribute(self, key: str, value: types.AttributeValue) -> None:
+         pass
+
+     def add_event(
+         self,
+         name: str,
+         attributes: types.Attributes = None,
+         timestamp: Optional[int] = None,
+     ) -> None:
+         pass
+
+     def update_name(self, name: str) -> None:
+         pass
+
+     def set_status(
+         self,
+         status: Union[Status, StatusCode],
+         description: Optional[str] = None,
+     ) -> None:
+         pass
+
+
+ class RustTracer(trace.Tracer):
+     def start_span(
+         self,
+         name: str,
+         context: Optional[trace.Context] = None,
+         kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+         attributes: types.Attributes = None,
+         links: trace._Links = None,
+         start_time: Optional[int] = None,
+         record_exception: bool = True,
+         set_status_on_exception: bool = True,
+     ) -> trace.Span:
+         return SpanWrapper(name)
+
+     @contextmanager
+     # pyre-fixme[15]: `start_as_current_span` overrides method defined in `Tracer`
+     #  inconsistently.
+     def start_as_current_span(
+         self,
+         name: str,
+         context: Optional[trace.Context] = None,
+         kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+         attributes: types.Attributes = None,
+         links: trace._Links = None,
+         start_time: Optional[int] = None,
+         record_exception: bool = True,
+         set_status_on_exception: bool = True,
+         end_on_exit: bool = True,
+     ) -> Iterator[trace.Span]:
+         with SpanWrapper(name) as s:
+             with trace.use_span(s):
+                 yield s
+         del s
+
+
+ class RustTracerProvider(trace.TracerProvider):
+     @override
+     def get_tracer(
+         self,
+         instrumenting_module_name: str,
+         *args: object,
+         instrumenting_library_version: Optional[str] = None,
+         schema_url: Optional[str] = None,
+         **kwargs: object,
+     ) -> trace.Tracer:
+         return RustTracer()
+
+
+ def get_monarch_tracer() -> Tracer:
+     """
+     Creates and returns a Monarch python tracer that logs to the Rust telemetry system.
+
+     Returns:
+         Tracer: A configured OpenTelemetry tracer for Monarch.
+
+     Usage:
+         tracer = get_monarch_tracer()
+         with tracer.start_as_current_span("span_name") as span:
+             # code here
+     """
+     install()
+     return trace.get_tracer("monarch.python.tracer")
+
+
+ _INSTALLED = False
+
+
+ def install() -> None:
+     global _INSTALLED
+     if _INSTALLED:
+         return
+
+     provider = RustTracerProvider()
+     trace.set_tracer_provider(provider)
+     _INSTALLED = True
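
For context, a short usage sketch of the tracer added above (not part of the diff; the span name and attribute are made up): `get_monarch_tracer()` installs `RustTracerProvider` as the global OpenTelemetry provider, and `SpanWrapper` only forwards enter/exit to the Rust side, so attributes, events, and status updates are deliberate no-ops.

    from monarch._src.actor.telemetry.rust_span_tracing import get_monarch_tracer

    tracer = get_monarch_tracer()  # calls install() once, then returns a tracer

    with tracer.start_as_current_span("load_checkpoint") as span:
        # Enter/exit timing for this span reaches the Rust telemetry backend
        # via PySpan. Attribute/event/status calls are accepted but dropped:
        span.set_attribute("path", "/tmp/ckpt")  # no-op (see SpanWrapper above)
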