torchmonarch-nightly 2025.8.2__cp313-cp313-manylinux2014_x86_64.whl → 2025.9.3__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +414 -216
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +1 -1
  40. monarch/tools/config/__init__.py +31 -4
  41. monarch/tools/config/defaults.py +13 -3
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +2 -0
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_debugger.py +639 -45
  53. tests/test_env_before_cuda.py +4 -4
  54. tests/test_mesh_trait.py +38 -0
  55. tests/test_python_actors.py +965 -75
  56. tests/test_rdma.py +7 -6
  57. tests/test_tensor_engine.py +6 -6
  58. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
  59. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
  60. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
  61. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
  62. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  63. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@
6
6
 
7
7
  # pyre-unsafe
8
8
 
9
+ import abc
9
10
  import collections
10
11
  import contextvars
11
12
  import functools
@@ -14,17 +15,18 @@ import itertools
14
15
  import logging
15
16
  import random
16
17
  import traceback
18
+ from abc import abstractmethod, abstractproperty
17
19
 
18
20
  from dataclasses import dataclass
19
- from traceback import extract_tb, StackSummary
21
+ from pprint import pformat
22
+ from textwrap import indent
23
+ from traceback import TracebackException
20
24
  from typing import (
21
25
  Any,
22
- AsyncGenerator,
23
26
  Awaitable,
24
27
  Callable,
25
28
  cast,
26
29
  Concatenate,
27
- Coroutine,
28
30
  Dict,
29
31
  Generator,
30
32
  Generic,
@@ -32,7 +34,6 @@ from typing import (
32
34
  Iterator,
33
35
  List,
34
36
  Literal,
35
- NamedTuple,
36
37
  Optional,
37
38
  overload,
38
39
  ParamSpec,
@@ -48,24 +49,25 @@ from monarch._rust_bindings.monarch_hyperactor.actor import (
48
49
  PythonMessage,
49
50
  PythonMessageKind,
50
51
  )
51
- from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
52
+ from monarch._rust_bindings.monarch_hyperactor.actor_mesh import (
53
+ PythonActorMesh,
54
+ PythonActorMeshImpl,
55
+ )
52
56
  from monarch._rust_bindings.monarch_hyperactor.mailbox import (
53
57
  Mailbox,
54
- OncePortReceiver,
58
+ OncePortReceiver as HyOncePortReceiver, # noqa: F401
55
59
  OncePortRef,
56
- PortReceiver as HyPortReceiver,
60
+ PortReceiver as HyPortReceiver, # noqa: F401
57
61
  PortRef,
62
+ UndeliverableMessageEnvelope,
58
63
  )
59
-
60
- if TYPE_CHECKING:
61
- from monarch._rust_bindings.monarch_hyperactor.actor import PortProtocol
62
- from monarch._rust_bindings.monarch_hyperactor.mailbox import PortReceiverBase
63
-
64
64
  from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
65
+ from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
66
+ from monarch._rust_bindings.monarch_hyperactor.selection import Selection as HySelection
65
67
  from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
66
68
  from monarch._rust_bindings.monarch_hyperactor.supervision import SupervisionError
67
- from monarch._rust_bindings.monarch_hyperactor.telemetry import enter_span, exit_span
68
69
  from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator
70
+ from monarch._src.actor.debugger.pdb_wrapper import PdbWrapper
69
71
  from monarch._src.actor.endpoint import (
70
72
  Endpoint,
71
73
  EndpointProperty,
@@ -74,21 +76,28 @@ from monarch._src.actor.endpoint import (
74
76
  Propagator,
75
77
  Selection,
76
78
  )
77
- from monarch._src.actor.future import Future
78
- from monarch._src.actor.pdb_wrapper import PdbWrapper
79
-
79
+ from monarch._src.actor.future import DeprecatedNotAFuture, Future
80
80
  from monarch._src.actor.pickle import flatten, unflatten
81
-
81
+ from monarch._src.actor.python_extension_methods import rust_struct
82
82
  from monarch._src.actor.shape import MeshTrait, NDSlice
83
83
  from monarch._src.actor.sync_state import fake_sync_state
84
-
84
+ from monarch._src.actor.telemetry import METER
85
85
  from monarch._src.actor.tensor_engine_shim import actor_rref, actor_send
86
+ from typing_extensions import Self
86
87
 
87
88
  if TYPE_CHECKING:
88
- from monarch._src.actor.proc_mesh import ProcMesh
89
+ from monarch._rust_bindings.monarch_hyperactor.actor import PortProtocol
90
+ from monarch._rust_bindings.monarch_hyperactor.actor_mesh import ActorMeshProtocol
91
+ from monarch._rust_bindings.monarch_hyperactor.mailbox import PortReceiverBase
92
+ from monarch._src.actor.proc_mesh import _ControllerController, ProcMesh
93
+ from monarch._src.actor.telemetry import get_monarch_tracer
94
+
95
+ CallMethod = PythonMessageKind.CallMethod
89
96
 
90
97
  logger: logging.Logger = logging.getLogger(__name__)
91
98
 
99
+ TRACER = get_monarch_tracer()
100
+
92
101
  Allocator = ProcessAllocator | LocalAllocator
93
102
 
94
103
  try:
@@ -106,22 +115,103 @@ class Point(HyPoint, collections.abc.Mapping):
106
115
  pass
107
116
 
108
117
 
109
- @dataclass
110
- class MonarchContext:
111
- mailbox: Mailbox
112
- proc_id: str
113
- point: Point
118
+ @rust_struct("monarch_hyperactor::mailbox::Instance")
119
+ class Instance(abc.ABC):
120
+ @abstractproperty
121
+ def _mailbox(self) -> Mailbox:
122
+ """
123
+ This can be removed once we fix all the uses of mailbox to just use context instead.
124
+ """
125
+ ...
126
+
127
+ @property
128
+ def proc_id(self) -> str:
129
+ """
130
+ The proc_id of the current actor.
131
+ """
132
+ return self.actor_id.proc_id
133
+
134
+ @abstractproperty
135
+ def actor_id(self) -> ActorId:
136
+ """
137
+ The actor_id of the current actor.
138
+ """
139
+ ...
140
+
141
+ @property
142
+ def proc(self) -> "ProcMesh":
143
+ """
144
+ The singleton proc mesh that corresponds to just this actor.
145
+ """
146
+
147
+ return self.proc_mesh.slice(**self.rank)
148
+
149
+ """
150
+ Every actor is spawned over some mesh of processes. This identifies the point in that mesh where
151
+ the current actor was spawned. In other words, it is the `monarch.current_rank()` of
152
+ The actors __init__ message.
153
+ """
154
+ rank: Point
155
+ proc_mesh: "ProcMesh"
156
+ _controller_controller: "_ControllerController"
157
+
158
+ # this property is used to hold the handles to actors and processes launched by this actor
159
+ # in order to keep them alive until this actor exits.
160
+ _children: "Optional[List[ActorMesh | ProcMesh]]"
161
+
162
+ def _add_child(self, child: "ActorMesh | ProcMesh") -> None:
163
+ if self._children is None:
164
+ self._children = [child]
165
+ else:
166
+ self._children.append(child)
167
+
168
+
169
+ @rust_struct("monarch_hyperactor::mailbox::Context")
170
+ class Context:
171
+ @property
172
+ def actor_instance(self) -> Instance:
173
+ """
174
+ Information about the actor currently running in this context.
175
+ """
176
+ ...
177
+
178
+ @property
179
+ def message_rank(self) -> Point:
180
+ """
181
+ Every message is sent as some broadcast of messages. This call identifies the
182
+ point in this space where the current actor is participating.
183
+
184
+ This is not the same self.actor_instance.rank: if the message was sent to some slice of
185
+ actors this identifies where the actor appears in the slice and not the identity of the actor.
186
+
187
+ These Point objects always exist. For singletons it will have 0 dimensions.
188
+ """
189
+ ...
114
190
 
115
191
  @staticmethod
116
- def get() -> "MonarchContext":
117
- return _context.get()
192
+ def _root_client_context() -> "Context": ...
118
193
 
119
194
 
120
- _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
195
+ _context: contextvars.ContextVar[Context] = contextvars.ContextVar(
121
196
  "monarch.actor_mesh._context"
122
197
  )
123
198
 
124
199
 
200
+ def context() -> Context:
201
+ c = _context.get(None)
202
+ if c is None:
203
+ c = Context._root_client_context()
204
+ _context.set(c)
205
+ from monarch._src.actor.host_mesh import create_local_host_mesh
206
+ from monarch._src.actor.proc_mesh import _get_controller_controller
207
+
208
+ c.actor_instance.proc_mesh, c.actor_instance._controller_controller = (
209
+ _get_controller_controller()
210
+ )
211
+ c.actor_instance.proc_mesh._host_mesh = create_local_host_mesh()
212
+ return c
213
+
214
+
125
215
  @dataclass
126
216
  class DebugContext:
127
217
  pdb_wrapper: Optional[PdbWrapper] = None
@@ -149,6 +239,37 @@ A = TypeVar("A")
149
239
  _load_balancing_seed = random.Random(4)
150
240
 
151
241
 
242
+ class _SingletonActorAdapator:
243
+ def __init__(self, inner: ActorId, shape: Optional[Shape] = None) -> None:
244
+ self._inner: ActorId = inner
245
+ if shape is None:
246
+ shape = singleton_shape
247
+ self._shape = shape
248
+
249
+ def cast(
250
+ self,
251
+ message: PythonMessage,
252
+ selection: str,
253
+ mailbox: Mailbox,
254
+ ) -> None:
255
+ mailbox.post(self._inner, message)
256
+
257
+ def new_with_shape(self, shape: Shape) -> "ActorMeshProtocol":
258
+ return _SingletonActorAdapator(self._inner, self._shape)
259
+
260
+ def supervision_event(self) -> "Optional[Shared[Exception]]":
261
+ return None
262
+
263
+ def stop(self) -> "PythonTask[None]":
264
+ raise NotImplementedError("stop()")
265
+
266
+ def initialized(self) -> "PythonTask[None]":
267
+ async def empty():
268
+ pass
269
+
270
+ return PythonTask.from_coroutine(empty())
271
+
272
+
152
273
  # standin class for whatever is the serializable python object we use
153
274
  # to name an actor mesh. Hacked up today because ActorMesh
154
275
  # isn't plumbed to non-clients
@@ -156,7 +277,7 @@ class _ActorMeshRefImpl:
156
277
  def __init__(
157
278
  self,
158
279
  mailbox: Mailbox,
159
- hy_actor_mesh: Optional[PythonActorMesh],
280
+ hy_actor_mesh: Optional[PythonActorMeshImpl],
160
281
  proc_mesh: "Optional[ProcMesh]",
161
282
  shape: Shape,
162
283
  actor_ids: List[ActorId],
@@ -171,29 +292,19 @@ class _ActorMeshRefImpl:
171
292
 
172
293
  @staticmethod
173
294
  def from_hyperactor_mesh(
174
- mailbox: Mailbox, hy_actor_mesh: PythonActorMesh, proc_mesh: "ProcMesh"
295
+ mailbox: Mailbox,
296
+ shape: Shape,
297
+ hy_actor_mesh: PythonActorMeshImpl,
298
+ proc_mesh: "ProcMesh",
175
299
  ) -> "_ActorMeshRefImpl":
176
- shape: Shape = hy_actor_mesh.shape
177
300
  return _ActorMeshRefImpl(
178
301
  mailbox,
179
302
  hy_actor_mesh,
180
303
  proc_mesh,
181
- hy_actor_mesh.shape,
304
+ shape,
182
305
  [cast(ActorId, hy_actor_mesh.get(i)) for i in range(len(shape))],
183
306
  )
184
307
 
185
- @staticmethod
186
- def from_actor_id(mailbox: Mailbox, actor_id: ActorId) -> "_ActorMeshRefImpl":
187
- return _ActorMeshRefImpl(mailbox, None, None, singleton_shape, [actor_id])
188
-
189
- @staticmethod
190
- def from_actor_ref_with_shape(
191
- ref: "_ActorMeshRefImpl", shape: Shape
192
- ) -> "_ActorMeshRefImpl":
193
- return _ActorMeshRefImpl(
194
- ref._mailbox, None, None, shape, ref._please_replace_me_actor_ids
195
- )
196
-
197
308
  def __getstate__(
198
309
  self,
199
310
  ) -> Tuple[Shape, List[ActorId], Mailbox]:
@@ -214,22 +325,19 @@ class _ActorMeshRefImpl:
214
325
  if self._actor_mesh is not None:
215
326
  if self._actor_mesh.stopped:
216
327
  raise SupervisionError(
217
- "actor mesh is not in a healthy state: `ActorMesh` has been stopped"
328
+ "actor mesh is unhealthy with reason: actor mesh is stopped due to proc mesh shutdown. "
329
+ "`PythonActorMesh` has already been stopped."
218
330
  )
219
331
 
220
332
  event = self._actor_mesh.get_supervision_event()
221
333
  if event is not None:
222
- raise SupervisionError(f"actor mesh is not in a healthy state: {event}")
223
-
224
- def send(self, rank: int, message: PythonMessage) -> None:
225
- self._check_state()
226
- actor = self._please_replace_me_actor_ids[rank]
227
- self._mailbox.post(actor, message)
334
+ raise SupervisionError(f"actor mesh is unhealthy with reason: {event}")
228
335
 
229
336
  def cast(
230
337
  self,
231
338
  message: PythonMessage,
232
- selection: Selection,
339
+ selection: str,
340
+ mailbox: Mailbox,
233
341
  ) -> None:
234
342
  self._check_state()
235
343
 
@@ -279,14 +387,44 @@ class _ActorMeshRefImpl:
279
387
  actor_id0 = self._please_replace_me_actor_ids[0]
280
388
  return actor_id0.actor_name, actor_id0.pid
281
389
 
282
- async def stop(self):
283
- await self._actor_mesh.stop()
390
+ @property
391
+ def shape(self) -> Shape:
392
+ return self._shape
393
+
394
+ @property
395
+ def proc_mesh(self) -> Optional["ProcMesh"]:
396
+ return self._proc_mesh
397
+
398
+ def new_with_shape(self, shape: Shape) -> "_ActorMeshRefImpl":
399
+ return _ActorMeshRefImpl(
400
+ self._mailbox, None, None, shape, self._please_replace_me_actor_ids
401
+ )
402
+
403
+ def supervision_event(self) -> "Optional[Shared[Exception]]":
404
+ if self._actor_mesh is None:
405
+ return None
406
+ return self._actor_mesh.supervision_event()
407
+
408
+ def stop(self) -> PythonTask[None]:
409
+ async def task():
410
+ if self._actor_mesh is not None:
411
+ self._actor_mesh.stop()
412
+
413
+ return PythonTask.from_coroutine(task())
414
+
415
+ def initialized(self) -> PythonTask[None]:
416
+ async def task():
417
+ pass
418
+
419
+ return PythonTask.from_coroutine(task())
284
420
 
285
421
 
286
422
  class ActorEndpoint(Endpoint[P, R]):
287
423
  def __init__(
288
424
  self,
289
- actor_mesh_ref: _ActorMeshRefImpl,
425
+ actor_mesh: "ActorMeshProtocol",
426
+ shape: Shape,
427
+ proc_mesh: "Optional[ProcMesh]",
290
428
  name: MethodSpecifier,
291
429
  impl: Callable[Concatenate[Any, P], Awaitable[R]],
292
430
  mailbox: Mailbox,
@@ -294,16 +432,14 @@ class ActorEndpoint(Endpoint[P, R]):
294
432
  explicit_response_port: bool,
295
433
  ) -> None:
296
434
  super().__init__(propagator)
297
- self._actor_mesh = actor_mesh_ref
435
+ self._actor_mesh = actor_mesh
298
436
  self._name = name
437
+ self._shape = shape
438
+ self._proc_mesh = proc_mesh
299
439
  self._signature: inspect.Signature = inspect.signature(impl)
300
440
  self._mailbox = mailbox
301
441
  self._explicit_response_port = explicit_response_port
302
442
 
303
- def _supervise(self, r: HyPortReceiver | OncePortReceiver) -> Any:
304
- mesh = self._actor_mesh._actor_mesh
305
- return r if mesh is None else mesh.supervise(r)
306
-
307
443
  def _call_name(self) -> Any:
308
444
  return self._name
309
445
 
@@ -334,19 +470,17 @@ class ActorEndpoint(Endpoint[P, R]):
334
470
  ),
335
471
  bytes,
336
472
  )
337
- self._actor_mesh.cast(message, selection)
473
+ self._actor_mesh.cast(message, selection, self._mailbox)
338
474
  else:
339
475
  actor_send(self, bytes, objects, port, selection)
340
- shape = self._actor_mesh._shape
476
+ shape = self._shape
341
477
  return Extent(shape.labels, shape.ndslice.sizes)
342
478
 
343
- def _port(self, once: bool = False) -> "PortTuple[R]":
344
- p, r = PortTuple.create(self._mailbox, once)
345
- if TYPE_CHECKING:
346
- assert isinstance(
347
- r._receiver, (HyPortReceiver | OncePortReceiver)
348
- ), "unexpected receiver type"
349
- return PortTuple(p, PortReceiver(self._mailbox, self._supervise(r._receiver)))
479
+ def _port(self, once: bool = False) -> "Tuple[Port[R], PortReceiver[R]]":
480
+ p, r = super()._port(once=once)
481
+ monitor: Optional[Shared[Exception]] = self._actor_mesh.supervision_event()
482
+ r._set_monitor(monitor)
483
+ return (p, r)
350
484
 
351
485
  def _rref(self, args, kwargs):
352
486
  self._check_arguments(args, kwargs)
@@ -386,11 +520,9 @@ def as_endpoint(
386
520
  if explicit_response_port
387
521
  else MethodSpecifier.ReturnsResponse
388
522
  )
389
- return ActorEndpoint(
390
- not_an_endpoint._ref._actor_mesh_ref,
523
+ return not_an_endpoint._ref._endpoint(
391
524
  kind(not_an_endpoint._name),
392
525
  getattr(not_an_endpoint._ref, not_an_endpoint._name),
393
- not_an_endpoint._ref._mailbox,
394
526
  propagate,
395
527
  explicit_response_port,
396
528
  )
@@ -405,9 +537,7 @@ class Accumulator(Generic[P, R, A]):
405
537
  self._combine: Callable[[A, R], A] = combine
406
538
 
407
539
  def accumulate(self, *args: P.args, **kwargs: P.kwargs) -> "Future[A]":
408
- gen: Generator[Coroutine[None, None, R], None, None] = self._endpoint._stream(
409
- *args, **kwargs
410
- )
540
+ gen: Generator[Future[R], None, None] = self._endpoint.stream(*args, **kwargs)
411
541
 
412
542
  async def impl() -> A:
413
543
  value = self._identity
@@ -438,17 +568,16 @@ class ValueMesh(MeshTrait, Generic[R]):
438
568
  return self._values[self._ndslice.nditem(coordinates)]
439
569
 
440
570
  def items(self) -> Iterable[Tuple[Point, R]]:
441
- for rank in self._shape.ranks():
442
- yield Point(rank, self._shape), self._values[rank]
571
+ extent = self._shape.extent
572
+ for i, rank in enumerate(self._shape.ranks()):
573
+ yield Point(i, extent), self._values[rank]
443
574
 
444
575
  def __iter__(self) -> Iterator[Tuple[Point, R]]:
445
576
  return iter(self.items())
446
577
 
447
- def __len__(self) -> int:
448
- return len(self._shape)
449
-
450
578
  def __repr__(self) -> str:
451
- return f"ValueMesh({self._shape})"
579
+ body = indent(pformat(tuple(self.items())), " ")
580
+ return f"ValueMesh({self._shape.extent}):\n{body}"
452
581
 
453
582
  @property
454
583
  def _ndslice(self) -> NDSlice:
@@ -522,49 +651,25 @@ R = TypeVar("R")
522
651
 
523
652
  T = TypeVar("T")
524
653
 
525
- if TYPE_CHECKING:
526
- # Python <= 3.10 cannot inherit from Generic[R] and NamedTuple at the same time.
527
- # we only need it for type checking though, so copypasta it until 3.11.
528
- class PortTuple(NamedTuple, Generic[R]):
529
- sender: "Port[R]"
530
- receiver: "PortReceiver[R]"
531
-
532
- @staticmethod
533
- def create(mailbox: Mailbox, once: bool = False) -> "PortTuple[Any]":
534
- handle, receiver = mailbox.open_once_port() if once else mailbox.open_port()
535
- port_ref = handle.bind()
536
- return PortTuple(
537
- Port(port_ref, mailbox, rank=None),
538
- PortReceiver(mailbox, receiver),
539
- )
540
- else:
541
-
542
- class PortTuple(NamedTuple):
543
- sender: "Port[Any]"
544
- receiver: "PortReceiver[Any]"
545
-
546
- @staticmethod
547
- def create(mailbox: Mailbox, once: bool = False) -> "PortTuple[Any]":
548
- handle, receiver = mailbox.open_once_port() if once else mailbox.open_port()
549
- port_ref = handle.bind()
550
- return PortTuple(
551
- Port(port_ref, mailbox, rank=None),
552
- PortReceiver(mailbox, receiver),
553
- )
554
-
555
654
 
556
655
  # advance lower-level API for sending messages. This is intentially
557
656
  # not part of the Endpoint API because they way it accepts arguments
558
657
  # and handles concerns is different.
559
- def port(endpoint: Endpoint[P, R], once: bool = False) -> "PortTuple[R]":
560
- return endpoint._port(once)
561
-
658
+ class Channel(Generic[R]):
659
+ @staticmethod
660
+ def open(once: bool = False) -> Tuple["Port[R]", "PortReceiver[R]"]:
661
+ mailbox = context().actor_instance._mailbox
662
+ handle, receiver = mailbox.open_once_port() if once else mailbox.open_port()
663
+ port_ref = handle.bind()
664
+ return (
665
+ Port(port_ref, mailbox, rank=None),
666
+ PortReceiver(mailbox, receiver),
667
+ )
562
668
 
563
- def ranked_port(
564
- endpoint: Endpoint[P, R], once: bool = False
565
- ) -> Tuple["Port[R]", "RankedPortReceiver[R]"]:
566
- p, receiver = port(endpoint, once)
567
- return p, RankedPortReceiver[R](receiver._mailbox, receiver._receiver)
669
+ @staticmethod
670
+ def open_ranked(once: bool = False) -> Tuple["Port[R]", "RankedPortReceiver[R]"]:
671
+ send, recv = Channel[R].open()
672
+ return (send, recv.ranked())
568
673
 
569
674
 
570
675
  class PortReceiver(Generic[R]):
@@ -572,12 +677,22 @@ class PortReceiver(Generic[R]):
572
677
  self,
573
678
  mailbox: Mailbox,
574
679
  receiver: "PortReceiverBase",
680
+ monitor: "Optional[Shared[Exception]]" = None,
575
681
  ) -> None:
576
682
  self._mailbox: Mailbox = mailbox
683
+ self._monitor = monitor
577
684
  self._receiver = receiver
578
685
 
579
686
  async def _recv(self) -> R:
580
- return self._process(await self._receiver.recv_task())
687
+ awaitable = self._receiver.recv_task()
688
+ if self._monitor is None:
689
+ result = await awaitable
690
+ else:
691
+ # type: ignore
692
+ result, i = await PythonTask.select_one([self._monitor.task(), awaitable])
693
+ if i == 0:
694
+ raise result
695
+ return self._process(result)
581
696
 
582
697
  def _process(self, msg: PythonMessage) -> R:
583
698
  # TODO: Try to do something more structured than a cast here
@@ -593,6 +708,12 @@ class PortReceiver(Generic[R]):
593
708
  def recv(self) -> "Future[R]":
594
709
  return Future(coro=self._recv())
595
710
 
711
+ def ranked(self) -> "RankedPortReceiver[R]":
712
+ return RankedPortReceiver[R](self._mailbox, self._receiver, self._monitor)
713
+
714
+ def _set_monitor(self, monitor: "Optional[Shared[Exception]]"):
715
+ self._monitor = monitor
716
+
596
717
 
597
718
  class RankedPortReceiver(PortReceiver[Tuple[int, R]]):
598
719
  def _process(self, msg: PythonMessage) -> Tuple[int, R]:
@@ -614,6 +735,8 @@ singleton_shape = Shape([], NDSlice(offset=0, sizes=[], strides=[]))
614
735
  # we need to signal to the consumer of the PythonTask object that the thread really isn't in an async context.
615
736
  # We do this by blanking out the running event loop during the call to the synchronous actor function.
616
737
 
738
+ MESSAGES_HANDLED = METER.create_counter("py_mesages_handled")
739
+
617
740
 
618
741
  class _Actor:
619
742
  """
@@ -637,30 +760,28 @@ class _Actor:
637
760
 
638
761
  async def handle(
639
762
  self,
640
- mailbox: Mailbox,
641
- rank: int,
642
- shape: Shape,
643
- method_spec: MethodSpecifier,
763
+ ctx: Context,
764
+ method: MethodSpecifier,
644
765
  message: bytes,
645
766
  panic_flag: PanicFlag,
646
767
  local_state: Iterable[Any],
647
- port: "PortProtocol",
768
+ response_port: "PortProtocol[Any]",
648
769
  ) -> None:
770
+ MESSAGES_HANDLED.add(1)
649
771
  # response_port can be None. If so, then sending to port will drop the response,
650
772
  # and raise any exceptions to the caller.
651
773
  try:
652
- ctx: MonarchContext = MonarchContext(
653
- mailbox, mailbox.actor_id.proc_id, Point(rank, shape)
654
- )
655
774
  _context.set(ctx)
656
775
 
657
776
  DebugContext.set(DebugContext())
658
777
 
659
778
  args, kwargs = unflatten(message, local_state)
660
779
 
661
- match method_spec:
780
+ match method:
662
781
  case MethodSpecifier.Init():
663
- Class, *args = args
782
+ ins = ctx.actor_instance
783
+ Class, ins.proc_mesh, ins._controller_controller, *args = args
784
+ ins.rank = ctx.message_rank
664
785
  try:
665
786
  self.instance = Class(*args, **kwargs)
666
787
  except Exception as e:
@@ -668,13 +789,13 @@ class _Actor:
668
789
  e, f"Remote actor {Class}.__init__ call failed."
669
790
  )
670
791
  raise e
671
- port.send(None)
792
+ response_port.send(None)
672
793
  return None
673
- case MethodSpecifier.ReturnsResponse(name=method):
794
+ case MethodSpecifier.ReturnsResponse(name=method_name):
674
795
  pass
675
- case MethodSpecifier.ExplicitPort(name=method):
676
- args = (port, *args)
677
- port = DroppingPort()
796
+ case MethodSpecifier.ExplicitPort(name=method_name):
797
+ args = (response_port, *args)
798
+ response_port = DroppingPort()
678
799
 
679
800
  if self.instance is None:
680
801
  # This could happen because of the following reasons. Both
@@ -687,52 +808,50 @@ class _Actor:
687
808
  # should never happen. It indicates either a bug in the
688
809
  # message delivery mechanism, or the framework accidentally
689
810
  # mixed the usage of cast and direct send.
690
- error_message = f"Actor object is missing when executing method {method} on actor {mailbox.actor_id}."
811
+
812
+ error_message = f"Actor object is missing when executing method {method_name} on actor {ctx.actor_instance.actor_id}."
691
813
  if self._saved_error is not None:
692
814
  error_message += (
693
815
  f" This is likely due to an earlier error: {self._saved_error}"
694
816
  )
695
817
  raise AssertionError(error_message)
696
- the_method = getattr(self.instance, method)
818
+
819
+ the_method = getattr(self.instance, method_name)
697
820
  if isinstance(the_method, EndpointProperty):
698
- module = the_method._method.__module__
699
821
  the_method = functools.partial(the_method._method, self.instance)
700
- else:
701
- module = the_method.__module__
702
822
 
703
823
  if inspect.iscoroutinefunction(the_method):
704
824
 
705
825
  async def instrumented():
706
- enter_span(
707
- module,
708
- method,
709
- str(ctx.mailbox.actor_id),
710
- )
711
- try:
712
- result = await the_method(*args, **kwargs)
713
- self._maybe_exit_debugger()
714
- except Exception as e:
715
- logging.critical(
716
- "Unhandled exception in actor endpoint",
717
- exc_info=e,
718
- )
719
- raise e
720
- exit_span()
826
+ with TRACER.start_as_current_span(
827
+ method_name,
828
+ attributes={"actor_id": str(ctx.actor_instance.actor_id)},
829
+ ):
830
+ try:
831
+ result = await the_method(*args, **kwargs)
832
+ self._maybe_exit_debugger()
833
+ except Exception as e:
834
+ logging.critical(
835
+ "Unhandled exception in actor endpoint",
836
+ exc_info=e,
837
+ )
838
+ raise e
721
839
  return result
722
840
 
723
841
  result = await instrumented()
724
842
  else:
725
- enter_span(module, method, str(ctx.mailbox.actor_id))
726
- with fake_sync_state():
727
- result = the_method(*args, **kwargs)
728
- self._maybe_exit_debugger()
729
- exit_span()
730
-
731
- port.send(result)
843
+ with TRACER.start_as_current_span(
844
+ method_name,
845
+ attributes={"actor_id": str(ctx.actor_instance.actor_id)},
846
+ ):
847
+ with fake_sync_state():
848
+ result = the_method(*args, **kwargs)
849
+ self._maybe_exit_debugger()
850
+
851
+ response_port.send(result)
732
852
  except Exception as e:
733
853
  self._post_mortem_debug(e.__traceback__)
734
- traceback.print_exc()
735
- port.exception(ActorError(e))
854
+ response_port.exception(ActorError(e))
736
855
  except BaseException as e:
737
856
  self._post_mortem_debug(e.__traceback__)
738
857
  # A BaseException can be thrown in the case of a Rust panic.
@@ -754,21 +873,33 @@ class _Actor:
754
873
  DebugContext.set(DebugContext())
755
874
 
756
875
  def _post_mortem_debug(self, exc_tb) -> None:
757
- from monarch._src.actor.debugger import DebugManager
876
+ from monarch._src.actor.debugger.debugger import debug_controller
758
877
 
759
878
  if (pdb_wrapper := DebugContext.get().pdb_wrapper) is not None:
760
879
  with fake_sync_state():
761
- ctx = MonarchContext.get()
880
+ ctx = context()
881
+ msg_rank = ctx.message_rank
762
882
  pdb_wrapper = PdbWrapper(
763
- ctx.point.rank,
764
- ctx.point.shape.coordinates(ctx.point.rank),
765
- ctx.mailbox.actor_id,
766
- DebugManager.ref().get_debug_client.call_one().get(),
883
+ msg_rank.rank,
884
+ {k: msg_rank[k] for k in msg_rank},
885
+ ctx.actor_instance.actor_id,
886
+ debug_controller(),
767
887
  )
768
888
  DebugContext.set(DebugContext(pdb_wrapper))
769
889
  pdb_wrapper.post_mortem(exc_tb)
770
890
  self._maybe_exit_debugger(do_continue=False)
771
891
 
892
+ def _handle_undeliverable_message(
893
+ self, message: UndeliverableMessageEnvelope
894
+ ) -> bool:
895
+ handle_undeliverable = getattr(
896
+ self.instance, "_handle_undeliverable_message", None
897
+ )
898
+ if handle_undeliverable is not None:
899
+ return handle_undeliverable(message)
900
+ else:
901
+ return False
902
+
772
903
 
773
904
  def _is_mailbox(x: object) -> bool:
774
905
  if hasattr(x, "__monarch_ref__"):
@@ -787,7 +918,7 @@ def _pickle(obj: object) -> bytes:
787
918
  return msg
788
919
 
789
920
 
790
- class Actor(MeshTrait):
921
+ class Actor(MeshTrait, DeprecatedNotAFuture):
791
922
  @functools.cached_property
792
923
  def logger(cls) -> logging.Logger:
793
924
  lgr = logging.getLogger(cls.__class__.__name__)
@@ -806,20 +937,39 @@ class Actor(MeshTrait):
806
937
  "actor implementations are not meshes, but we can't convince the typechecker of it..."
807
938
  )
808
939
 
809
- def _new_with_shape(self, shape: Shape) -> "ActorMeshRef":
940
+ def _new_with_shape(self, shape: Shape) -> Self:
810
941
  raise NotImplementedError(
811
942
  "actor implementations are not meshes, but we can't convince the typechecker of it..."
812
943
  )
813
944
 
945
+ @property
946
+ def initialized(self):
947
+ raise NotImplementedError(
948
+ "actor implementations are not meshes, but we can't convince the typechecker of it..."
949
+ )
950
+
951
+ def _handle_undeliverable_message(
952
+ self, message: UndeliverableMessageEnvelope
953
+ ) -> bool:
954
+ # Return False to indicate that the undeliverable message was not handled.
955
+ return False
814
956
 
815
- class ActorMeshRef(MeshTrait):
957
+
958
+ class ActorMesh(MeshTrait, Generic[T], DeprecatedNotAFuture):
816
959
  def __init__(
817
- self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
960
+ self,
961
+ Class: Type[T],
962
+ inner: "ActorMeshProtocol",
963
+ mailbox: Mailbox,
964
+ shape: Shape,
965
+ proc_mesh: "Optional[ProcMesh]",
818
966
  ) -> None:
819
967
  self.__name__: str = Class.__name__
820
968
  self._class: Type[T] = Class
821
- self._actor_mesh_ref: _ActorMeshRefImpl = actor_mesh_ref
969
+ self._inner: "ActorMeshProtocol" = inner
822
970
  self._mailbox: Mailbox = mailbox
971
+ self._shape = shape
972
+ self._proc_mesh = proc_mesh
823
973
  for attr_name in dir(self._class):
824
974
  attr_value = getattr(self._class, attr_name, None)
825
975
  if isinstance(attr_value, EndpointProperty):
@@ -832,11 +982,9 @@ class ActorMeshRef(MeshTrait):
832
982
  setattr(
833
983
  self,
834
984
  attr_name,
835
- ActorEndpoint(
836
- self._actor_mesh_ref,
985
+ self._endpoint(
837
986
  kind(attr_name),
838
987
  attr_value._method,
839
- self._mailbox,
840
988
  attr_value._propagator,
841
989
  attr_value._explicit_response_port,
842
990
  ),
@@ -847,53 +995,95 @@ class ActorMeshRef(MeshTrait):
847
995
  return NotAnEndpoint(self, attr)
848
996
  raise AttributeError(attr)
849
997
 
850
- def _create(
998
+ def _endpoint(
851
999
  self,
852
- args: Iterable[Any],
853
- kwargs: Dict[str, Any],
854
- ) -> None:
1000
+ name: MethodSpecifier,
1001
+ impl: Callable[Concatenate[Any, P], Awaitable[R]],
1002
+ propagator: Any,
1003
+ explicit_response_port: bool,
1004
+ ):
1005
+ return ActorEndpoint(
1006
+ self._inner,
1007
+ self._shape,
1008
+ self._proc_mesh,
1009
+ name,
1010
+ impl,
1011
+ self._mailbox,
1012
+ propagator,
1013
+ explicit_response_port,
1014
+ )
1015
+
1016
+ @classmethod
1017
+ def _create(
1018
+ cls,
1019
+ Class: Type[T],
1020
+ actor_mesh: "PythonActorMesh | PythonActorMeshImpl",
1021
+ mailbox: Mailbox,
1022
+ shape: Shape,
1023
+ proc_mesh: "ProcMesh",
1024
+ controller_controller: Optional["_ControllerController"],
1025
+ # args and kwargs are passed to the __init__ method of the user defined
1026
+ # python actor object.
1027
+ *args: Any,
1028
+ **kwargs: Any,
1029
+ ) -> "ActorMesh[T]":
1030
+ if isinstance(actor_mesh, PythonActorMeshImpl):
1031
+ actor_mesh = _ActorMeshRefImpl.from_hyperactor_mesh(
1032
+ mailbox, shape, actor_mesh, proc_mesh
1033
+ )
1034
+
1035
+ mesh = cls(Class, actor_mesh, mailbox, shape, proc_mesh)
1036
+
855
1037
  async def null_func(*_args: Iterable[Any], **_kwargs: Dict[str, Any]) -> None:
856
1038
  return None
857
1039
 
858
- ep = ActorEndpoint(
859
- self._actor_mesh_ref,
1040
+ # send __init__ message to the mesh to initialize the user defined
1041
+ # python actor object.
1042
+ ep = mesh._endpoint(
860
1043
  MethodSpecifier.Init(),
861
1044
  null_func,
862
- self._mailbox,
863
1045
  None,
864
1046
  False,
865
1047
  )
866
- send(ep, (self._class, *args), kwargs)
867
-
868
- def __reduce_ex__(
869
- self, protocol: ...
870
- ) -> "Tuple[Type[ActorMeshRef], Tuple[Any, ...]]":
871
- return ActorMeshRef, (
872
- self._class,
873
- self._actor_mesh_ref,
874
- self._mailbox,
1048
+ send(ep, (mesh._class, proc_mesh, controller_controller, *args), kwargs)
1049
+
1050
+ return mesh
1051
+
1052
+ @classmethod
1053
+ def from_actor_id(
1054
+ cls,
1055
+ Class: Type[T],
1056
+ actor_id: ActorId,
1057
+ mailbox: Mailbox,
1058
+ ) -> "ActorMesh[T]":
1059
+ return cls(
1060
+ Class, _SingletonActorAdapator(actor_id), mailbox, singleton_shape, None
875
1061
  )
876
1062
 
1063
+ def __reduce_ex__(self, protocol: ...) -> "Tuple[Type[ActorMesh], Tuple[Any, ...]]":
1064
+ return ActorMesh, (self._class, self._inner, self._mailbox, self._shape, None)
1065
+
877
1066
  @property
878
1067
  def _ndslice(self) -> NDSlice:
879
- return self._actor_mesh_ref._shape.ndslice
1068
+ return self._shape.ndslice
880
1069
 
881
1070
  @property
882
1071
  def _labels(self) -> Iterable[str]:
883
- return self._actor_mesh_ref._shape.labels
1072
+ return self._shape.labels
884
1073
 
885
- def _new_with_shape(self, shape: Shape) -> "ActorMeshRef":
886
- return ActorMeshRef(
887
- self._class,
888
- _ActorMeshRefImpl.from_actor_ref_with_shape(self._actor_mesh_ref, shape),
889
- self._mailbox,
890
- )
1074
+ def _new_with_shape(self, shape: Shape) -> "ActorMesh[T]":
1075
+ sliced = self._inner.new_with_shape(shape)
1076
+ return ActorMesh(self._class, sliced, self._mailbox, shape, self._proc_mesh)
891
1077
 
892
1078
  def __repr__(self) -> str:
893
- return f"ActorMeshRef(class={self._class}, shape={self._actor_mesh_ref._shape})"
1079
+ return f"ActorMesh(class={self._class}, shape={self._shape}), inner={type(self._inner)})"
1080
+
1081
+ def stop(self) -> "Future[None]":
1082
+ return Future(coro=self._inner.stop())
894
1083
 
895
- async def stop(self):
896
- await self._actor_mesh_ref.stop()
1084
+ @property
1085
+ def initialized(self) -> Future[None]:
1086
+ return Future(coro=self._inner.initialized())
897
1087
 
898
1088
 
899
1089
  class ActorError(Exception):
@@ -909,27 +1099,35 @@ class ActorError(Exception):
909
1099
  message: str = "A remote actor call has failed.",
910
1100
  ) -> None:
911
1101
  self.exception = exception
912
- self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
1102
+ # Need to stringify the exception early, because the PyPI package
1103
+ # exceptiongroup may monkeypatch the "TracebackException" class for python
1104
+ # versions < 3.11. If it gets unpickled in a different scope without
1105
+ # using that monkeypatch, it'll have an exception in "format()".
1106
+ # Store the traceback string instead which shouldn't change between machines.
1107
+ actor_mesh_ref_tb = TracebackException.from_exception(exception).format()
1108
+ # Replace any traceback lines to indicate it's a remote call traceback.
1109
+ actor_mesh_ref_tb = (
1110
+ s.replace(
1111
+ "Traceback (most recent call last):",
1112
+ "Traceback of where the remote call failed (most recent call last):",
1113
+ )
1114
+ for s in actor_mesh_ref_tb
1115
+ )
1116
+ self.exception_formatted = "".join(actor_mesh_ref_tb)
913
1117
  self.message = message
914
1118
 
915
1119
  def __str__(self) -> str:
916
- exe = str(self.exception)
917
- actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
918
- return (
919
- f"{self.message}\n"
920
- f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
921
- )
1120
+ return f"{self.message}\n {self.exception_formatted}"
922
1121
 
923
1122
 
924
1123
  def current_actor_name() -> str:
925
- return str(MonarchContext.get().mailbox.actor_id)
1124
+ return str(context().actor_instance.actor_id)
926
1125
 
927
1126
 
928
1127
  def current_rank() -> Point:
929
- ctx = MonarchContext.get()
930
- return ctx.point
1128
+ return context().message_rank
931
1129
 
932
1130
 
933
1131
  def current_size() -> Dict[str, int]:
934
- ctx = MonarchContext.get()
935
- return dict(zip(ctx.point.shape.labels, ctx.point.shape.ndslice.sizes))
1132
+ r = context().message_rank.extent
1133
+ return {k: r[k] for k in r}