torchmonarch-nightly 2025.8.1__cp310-cp310-manylinux2014_x86_64.whl → 2025.9.3__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +414 -216
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +1 -1
  40. monarch/tools/config/__init__.py +31 -4
  41. monarch/tools/config/defaults.py +13 -3
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +2 -0
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_coalescing.py +1 -1
  53. tests/test_debugger.py +639 -45
  54. tests/test_env_before_cuda.py +4 -4
  55. tests/test_mesh_trait.py +38 -0
  56. tests/test_python_actors.py +979 -75
  57. tests/test_rdma.py +7 -6
  58. tests/test_tensor_engine.py +6 -6
  59. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
  60. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +64 -48
  61. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
  62. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
  63. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  64. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,8 @@
8
8
 
9
9
  import abc
10
10
  import logging
11
- from typing import Awaitable, final, Optional, TYPE_CHECKING
11
+ from dataclasses import dataclass
12
+ from typing import Dict, final, Literal, Optional
12
13
 
13
14
  from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
14
15
  Alloc,
@@ -18,21 +19,59 @@ from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monar
18
19
  RemoteAllocatorBase,
19
20
  SimAllocatorBase,
20
21
  )
21
- from monarch._src.actor.future import Future
22
22
 
23
- if TYPE_CHECKING:
24
- from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
23
+ from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
24
+ from monarch._src.actor.future import DeprecatedNotAFuture, Future
25
+
25
26
 
26
27
  ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
27
28
 
28
29
  logger: logging.Logger = logging.getLogger(__name__)
29
30
 
30
31
 
32
+ @dataclass
33
+ class AllocHandle(DeprecatedNotAFuture):
34
+ _hy_alloc: "Shared[Alloc]"
35
+ _extent: Dict[str, int]
36
+ _stream_logs: bool
37
+
38
+ def reshape(self, extent: Dict[str, int]) -> "AllocHandle":
39
+ async def task() -> Alloc:
40
+ alloc = await self._hy_alloc
41
+ return alloc.reshape(extent)
42
+
43
+ return AllocHandle(
44
+ PythonTask.from_coroutine(task()).spawn(), extent, self._stream_logs
45
+ )
46
+
47
+ @property
48
+ def initialized(self) -> Future[Literal[True]]:
49
+ """
50
+ Future completes with 'True' when the alloc has initialized.
51
+ Because allocs are remote objects, there is no guarantee that the alloc is
52
+ still usable after this completes, only that at some point in the past it was usable.
53
+ """
54
+
55
+ async def task() -> Literal[True]:
56
+ await self._hy_alloc
57
+ return True
58
+
59
+ return Future(coro=task())
60
+
61
+ @property
62
+ def stream_logs(self) -> bool:
63
+ """
64
+ Whether to stream stdout/stderr logs from the allocated processes back to the client.
65
+ The default behavior is determined by the underlying allocator.
66
+ """
67
+ return self._stream_logs
68
+
69
+
31
70
  class AllocateMixin(abc.ABC):
32
71
  @abc.abstractmethod
33
72
  def allocate_nonblocking(self, spec: AllocSpec) -> "PythonTask[Alloc]": ...
34
73
 
35
- def allocate(self, spec: AllocSpec) -> "Future[Alloc]":
74
+ def allocate(self, spec: AllocSpec) -> "AllocHandle":
36
75
  """
37
76
  Allocate a process according to the provided spec.
38
77
 
@@ -42,7 +81,25 @@ class AllocateMixin(abc.ABC):
42
81
  Returns:
43
82
  - A future that will be fulfilled when the requested allocation is fulfilled.
44
83
  """
45
- return Future(coro=self.allocate_nonblocking(spec))
84
+ return AllocHandle(
85
+ self.allocate_nonblocking(spec).spawn(),
86
+ spec.extent,
87
+ self._stream_logs(),
88
+ )
89
+
90
+ @abc.abstractmethod
91
+ def _stream_logs(self) -> bool:
92
+ """
93
+ Whether to stream stdout/stderr logs from the allocated processes back to the client.
94
+ A common pattern is if the processes are allocated on the same host as the client,
95
+ then it is not necessary to stream logs back. But if the processes are remotely allocated,
96
+ it is recommended to stream logs back. It is up to each allocator to decide the default behavior.
97
+
98
+ Returns:
99
+ - A boolean indicating whether to stream logs back to the client.
100
+ """
101
+
102
+ ...
46
103
 
47
104
 
48
105
  @final
@@ -51,6 +108,9 @@ class ProcessAllocator(ProcessAllocatorBase, AllocateMixin):
51
108
  An allocator that allocates by spawning local processes.
52
109
  """
53
110
 
111
+ def _stream_logs(self) -> bool:
112
+ return False
113
+
54
114
 
55
115
  @final
56
116
  class LocalAllocator(LocalAllocatorBase, AllocateMixin):
@@ -58,6 +118,9 @@ class LocalAllocator(LocalAllocatorBase, AllocateMixin):
58
118
  An allocator that allocates by spawning actors into the current process.
59
119
  """
60
120
 
121
+ def _stream_logs(self) -> bool:
122
+ return False
123
+
61
124
 
62
125
  @final
63
126
  class SimAllocator(SimAllocatorBase, AllocateMixin):
@@ -65,6 +128,9 @@ class SimAllocator(SimAllocatorBase, AllocateMixin):
65
128
  An allocator that allocates by spawning actors into the current process using simulated channels for transport
66
129
  """
67
130
 
131
+ def _stream_logs(self) -> bool:
132
+ return False
133
+
68
134
 
69
135
  class RemoteAllocInitializer(abc.ABC):
70
136
  """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
@@ -198,3 +264,6 @@ class RemoteAllocator(RemoteAllocatorBase, AllocateMixin):
198
264
  An allocator that allocates by spawning actors on a remote host.
199
265
  The remote host must be running hyperactor's remote-process-allocator.
200
266
  """
267
+
268
+ def _stream_logs(self) -> bool:
269
+ return True
@@ -4,6 +4,8 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-unsafe
8
+
7
9
  """
8
10
  This is the main function for bootstrapping a new process using a ProcessAllocator.
9
11
  """
@@ -17,7 +19,7 @@ import sys
17
19
 
18
20
  # Import torch to avoid import-time races if a spawned actor tries to import torch.
19
21
  try:
20
- import torch # @manual
22
+ import torch # @manual # noqa: F401
21
23
  except ImportError:
22
24
  pass
23
25
 
@@ -36,14 +38,15 @@ def invoke_main():
36
38
  global bootstrap_main
37
39
 
38
40
  # TODO: figure out what from worker_main.py we should reproduce here.
39
- from monarch._src.actor.telemetry import TracingForwarder
41
+ from monarch._src.actor.telemetry import TracingForwarder # noqa
40
42
 
41
43
  if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
42
44
  raise RuntimeError("Error during bootstrap for testing")
43
45
 
44
46
  # forward logs to rust tracing. Defaults to on.
45
47
  if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
46
- logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
48
+ # we can stream python logs now; no need to forward them to rust processes
49
+ pass
47
50
  # install opentelemetry tracing
48
51
 
49
52
  try:
@@ -58,7 +61,7 @@ def invoke_main():
58
61
  except Exception as e:
59
62
  logging.warning(f"Failed to set up py-spy: {e}")
60
63
 
61
- from monarch._src.actor.debugger import remote_breakpointhook
64
+ from monarch._src.actor.debugger.debugger import remote_breakpointhook
62
65
 
63
66
  sys.breakpointhook = remote_breakpointhook
64
67
 
@@ -6,7 +6,9 @@
6
6
 
7
7
  from monarch._rust_bindings.monarch_extension.code_sync import ( # noqa: F401
8
8
  CodeSyncMeshClient,
9
+ CodeSyncMethod,
9
10
  RemoteWorkspace,
11
+ WorkspaceConfig,
10
12
  WorkspaceLocation,
11
13
  WorkspaceShape,
12
14
  )
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe