torchmonarch-nightly 2025.7.1__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.25__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/_src/actor/tensor_engine_shim.py ADDED
@@ -0,0 +1,56 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import importlib
+ from functools import partial
+ from typing import Any, Optional, Sequence, TYPE_CHECKING
+
+ """
+ This file provides a type annoated shim for using tensor engine functions
+ from within the actor module which only optionally includes the tensor engine.
+
+ Each function that is needed should have a @shim entry below which gives the name,
+ module, and type of the function. Each function is resolved dynamically the first
+ time it is used.
+ """
+
+ if TYPE_CHECKING:
+     from monarch._src.actor.actor_mesh import ActorEndpoint, Port, Selection
+     from monarch._src.actor.endpoint import Endpoint
+
+
+ def shim(fn=None, *, module=None):
+     if fn is None:
+         return partial(shim, module=module)
+
+     impl = None
+     name = fn.__name__
+
+     def wrap(*args, **kwargs):
+         nonlocal impl
+         if impl is None:
+             impl = getattr(importlib.import_module(module), name)
+         return impl(*args, **kwargs)
+
+     return wrap
+
+
+ @shim(module="monarch.mesh_controller")
+ def actor_send(
+     endpoint: "ActorEndpoint",
+     args_kwargs_tuple: bytes,
+     refs: "Sequence[Any]",
+     port: "Optional[Port[Any]]",
+     selection: "Selection",
+ ) -> None: ...
+
+
+ @shim(module="monarch.common.remote")
+ def _cached_propagation(_cache, rfunction: "Endpoint", args, kwargs) -> Any: ...
+
+
+ @shim(module="monarch.common.fake")
+ def fake_call(fn, *args, **kwargs): ...
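The @shim decorator above makes the actor package's references to tensor-engine functions lazy: nothing is imported until a shimmed function is actually called. A minimal, self-contained sketch of the same pattern, using the standard-library math module as a stand-in target for illustration (the monarch modules named in the diff are the real targets):

import importlib
from functools import partial


def shim(fn=None, *, module=None):
    # Lazy-resolution pattern from the diff above: the real implementation is
    # looked up by name in `module` only on the first call.
    if fn is None:
        return partial(shim, module=module)

    impl = None
    name = fn.__name__

    def wrap(*args, **kwargs):
        nonlocal impl
        if impl is None:
            impl = getattr(importlib.import_module(module), name)
        return impl(*args, **kwargs)

    return wrap


@shim(module="math")  # stand-in module for illustration only
def sqrt(x: float) -> float: ...


print(sqrt(9.0))  # first call resolves math.sqrt, then reuses it; prints 3.0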
monarch/_src/tensor_engine/rdma.py ADDED
@@ -0,0 +1,180 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import warnings
+ from typing import Optional
+
+ import torch
+
+ try:
+     from monarch._rust_bindings.rdma import _RdmaBuffer
+ except ImportError as e:
+     logging.error("RDMA is not available: {}".format(e))
+     raise e
+ from monarch._src.actor.actor_mesh import MonarchContext
+ from monarch._src.actor.future import Future
+
+
+ # RDMARead/WriteTransferWarnings are warnings that are only printed once per process.
+ # Remove these once GPU support is added.
+ class RDMAReadTransferWarning(Warning):
+     pass
+
+
+ class RDMAWriteTransferWarning(Warning):
+     pass
+
+
+ warnings.simplefilter("once", RDMAReadTransferWarning)
+ warnings.simplefilter("once", RDMAWriteTransferWarning)
+
+
+ def is_available():
+     return _RdmaBuffer.rdma_supported()
+
+
+ def _assert_tensor_is_1d_contiguous_uint8(t: torch.Tensor) -> None:
+     if t.ndim != 1:
+         raise ValueError(f"Tensor must be 1D, got {t.ndim}D")
+     if t.dtype != torch.uint8:
+         raise ValueError(f"Tensor must be uint8, got {t.dtype}")
+     if not t.is_contiguous():
+         raise ValueError("Tensor must be contiguous")
+
+
+ class RDMABuffer:
+     def __init__(self, data: torch.Tensor) -> None:
+         """
+         RDMABuffer only supports 1D contiguous tensors that are 1 byte per item.
+
+         To create a 1 byte, 1D view, use t.view(torch.uint8).flatten()
+
+         TODO: Create TensorBuffer, which will be main user API supporting non-contiguous , multi-byte-per-elment tensors
+         """
+         assert (
+             is_available()
+         ), "Tried to create an RDMABuffer, but RDMA is not available on this platform."
+
+         if data.device.type != "cpu":
+             # TODO - CUDA support for RDMABuffer exists at the Rust layer, but
+             # runs into issues with MR creation. For now, only support CPU tensors.
+             # Remove this once GPU support is added.
+             raise ValueError(
+                 "RDMABuffer currently only supports CPU tensors (got device {})".format(
+                     data.device
+                 )
+             )
+
+         _assert_tensor_is_1d_contiguous_uint8(data)
+         assert data.storage_offset() == 0
+
+         try:
+             storage = data.untyped_storage()
+             addr: int = storage.data_ptr()
+             size = storage.element_size() * data.numel()
+             ctx = MonarchContext.get()
+             self._buffer: _RdmaBuffer = _RdmaBuffer.create_rdma_buffer_blocking(
+                 addr=addr,
+                 size=size,
+                 proc_id=ctx.proc_id,
+                 client=ctx.mailbox,
+             )
+         # TODO - specific exception
+         except Exception as e:
+             logging.error("Failed to create buffer %s", e)
+             raise e
+
+     def read_into(
+         self,
+         dst: torch.Tensor,
+         offset: int = 0,
+         timeout: int = 3,
+     ) -> Future[Optional[int]]:
+         """
+         Read data from the RDMABuffer into a destination tensor.
+
+         The destination tensor must be contiguous and 1 byte per item.
+
+         Returns an ActorFuture that can be awaited or called with .get() for blocking operation.
+         """
+         _assert_tensor_is_1d_contiguous_uint8(dst)
+         dst_gpu = None
+         if dst.device.type != "cpu":
+             # TODO - remove this once GPU support is added.
+             warnings.warn(
+                 "note: read_into only supports CPU tensors, so `dst` is being copied to CPU.",
+                 RDMAReadTransferWarning,
+                 stacklevel=2,
+             )
+             dst_gpu = dst
+             dst = dst.cpu()
+         storage = dst.untyped_storage()
+         addr: int = storage.data_ptr() + offset
+         size = storage.element_size() * dst.numel()
+         if offset + size > dst.numel():
+             raise ValueError(
+                 f"offset + size ({offset + size}) must be <= dst.numel() ({dst.numel()})"
+             )
+
+         async def read_into_nonblocking() -> Optional[int]:
+             res = await self._buffer.read_into(
+                 addr=addr,
+                 size=size,
+                 local_proc_id=MonarchContext.get().proc_id,
+                 client=MonarchContext.get().mailbox,
+                 timeout=timeout,
+             )
+             # TODO - remove this once GPU support is added.
+             if dst_gpu is not None:
+                 dst_gpu.copy_(dst)
+             return res
+
+         return Future(impl=read_into_nonblocking, requires_loop=False)
+
+     def write_from(
+         self, src: torch.Tensor, offset: int = 0, timeout: int = 3
+     ) -> Future[None]:
+         """
+         Write data from a source tensor into the RDMABuffer.
+
+         The source tensor must be contiguous and 1 byte per item.
+
+         Returns an ActorFuture that can be awaited or called with .get() for blocking operation.
+         """
+         _assert_tensor_is_1d_contiguous_uint8(src)
+         src_gpu = None
+         if src.device.type != "cpu":
+             # TODO - remove this once GPU support is added.
+             warnings.warn(
+                 "note: write_from only supports CPU tensors, so we will write to CPU first, then transfer to `src` in place.",
+                 RDMAWriteTransferWarning,
+                 stacklevel=2,
+             )
+             src_gpu = src  # Save the original GPU tensor reference
+             src = src.cpu()  # Convert to CPU for RDMA operation
+         storage = src.untyped_storage()
+         addr: int = storage.data_ptr()
+         size = storage.element_size() * src.numel()
+         if size + offset > src.numel():
+             raise ValueError(
+                 f"size + offset ({size + offset}) must be <= src.numel() ({src.numel()})"
+             )
+
+         async def write_from_nonblocking() -> None:
+             res = await self._buffer.write_from(
+                 addr=addr,
+                 size=size,
+                 local_proc_id=MonarchContext.get().proc_id,
+                 client=MonarchContext.get().mailbox,
+                 timeout=timeout,
+             )
+             # TODO - remove this once GPU support is added.
+             if src_gpu is not None:
+                 src_gpu.copy_(src)
+             return res
+
+         return Future(impl=write_from_nonblocking, requires_loop=False)
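Both transfer methods return a Future that can either be awaited or resolved synchronously with .get(). A minimal usage sketch, assuming it runs inside a live Monarch actor (RDMABuffer resolves the proc id and mailbox via MonarchContext.get()); the tensor names and shapes are illustrative, and the import path is the private module from this diff rather than a documented entry point:

import torch

from monarch._src.tensor_engine.rdma import is_available, RDMABuffer


def rdma_round_trip() -> None:
    if not is_available():
        return  # RDMA not supported on this host

    # Buffers must be 1-D, contiguous, uint8: reinterpret any tensor that way.
    src = torch.arange(16, dtype=torch.float32).view(torch.uint8).flatten()
    dst = torch.zeros_like(src)

    buf = RDMABuffer(src)          # registers src's storage for RDMA
    buf.read_into(dst).get()       # blocking read of the buffer into dst
    assert torch.equal(dst, src)

    buf.write_from(dst).get()      # blocking write of dst back into the buffer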
monarch/_testing.py CHANGED
@@ -13,13 +13,13 @@ from contextlib import contextmanager, ExitStack
  from typing import Any, Callable, Dict, Generator, Literal, Optional

  import monarch_supervisor
+ from monarch._src.actor.shape import NDSlice
+ from monarch.actor import proc_mesh, ProcMesh
  from monarch.common.client import Client
  from monarch.common.device_mesh import DeviceMesh
  from monarch.common.invocation import DeviceException, RemoteException
- from monarch.common.shape import NDSlice
  from monarch.controller.backend import ProcessBackend
  from monarch.mesh_controller import spawn_tensor_engine
- from monarch.proc_mesh import proc_mesh, ProcMesh
  from monarch.python_local_mesh import PythonLocalContext
  from monarch.rust_local_mesh import (
      local_mesh,
@@ -228,3 +228,4 @@ def mock_mesh(hosts: int, gpus: int):
  class BackendType:
      PY = "py"
      RS = "rs"
+     MESH = "mesh"
monarch/actor/__init__.py ADDED
@@ -0,0 +1,51 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Monarch Actor API - Public interface for actor functionality.
+ """
+
+ from monarch._src.actor.actor_mesh import (
+     Accumulator,
+     Actor,
+     ActorError,
+     current_actor_name,
+     current_rank,
+     current_size,
+     Point,
+     port,
+     send,
+     ValueMesh,
+ )
+ from monarch._src.actor.endpoint import endpoint
+ from monarch._src.actor.future import Future
+ from monarch._src.actor.proc_mesh import (
+     debug_client,
+     local_proc_mesh,
+     proc_mesh,
+     ProcMesh,
+     sim_proc_mesh,
+ )
+
+ __all__ = [
+     "Accumulator",
+     "Actor",
+     "ActorError",
+     "current_actor_name",
+     "current_rank",
+     "current_size",
+     "endpoint",
+     "Future",
+     "local_proc_mesh",
+     "Point",
+     "proc_mesh",
+     "ProcMesh",
+     "port",
+     "send",
+     "sim_proc_mesh",
+     "ValueMesh",
+     "debug_client",
+ ]
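The new monarch.actor package gathers the public actor surface in one import. A hedged sketch of how these exports fit together, based only on the names re-exported above; the Counter actor, the gpus=1 sizing, and the spawn/call pattern are assumptions about the ProcMesh and endpoint APIs rather than something this diff establishes:

import asyncio

from monarch.actor import Actor, endpoint, proc_mesh


class Counter(Actor):
    def __init__(self, start: int) -> None:
        self.value = start

    @endpoint
    async def bump(self) -> int:
        # Each actor instance in the mesh keeps its own counter state.
        self.value += 1
        return self.value


async def main() -> None:
    mesh = await proc_mesh(gpus=1)                      # assumed awaitable constructor
    counter = await mesh.spawn("counter", Counter, 0)   # assumed spawn signature
    print(await counter.bump.call())                    # endpoint call across the mesh


asyncio.run(main())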