torchmonarch-nightly 2025.6.27__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
@@ -0,0 +1,165 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import string
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Optional
11
+
12
+ from monarch.tools.network import get_sockaddr
13
+ from torchx import specs
14
+
15
+ DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
16
+
17
+ _TAG_MESHES_PREFIX = "monarch/meshes/${mesh_name}/"
18
+ _TAG_HOST_TYPE: str = _TAG_MESHES_PREFIX + "host_type"
19
+ _TAG_GPUS: str = _TAG_MESHES_PREFIX + "gpus"
20
+ _TAG_TRANSPORT: str = _TAG_MESHES_PREFIX + "transport"
21
+
22
+ _UNSET_INT = -1
23
+ _UNSET_STR = "__UNSET__"
24
+
25
+
26
@dataclass
class MeshSpec:
    """Describes a single mesh (role) of a monarch server.

    The same structure is used in two directions: as the input that says how
    the mesh role should be set up when a job is submitted, and as the value
    handed back by the 'info' (describe) API.
    """

    name: str
    num_hosts: int
    host_type: str = _UNSET_STR
    gpus: int = _UNSET_INT
    # NOTE: kept as a plain str instead of the
    #  monarch._rust_bindings.monarch_hyperactor.channel.ChannelTransport enum
    #  b/c the rust binding lacks Python enum semantics and does not serialize well
    transport: str = "tcp"
    port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
    hostnames: list[str] = field(default_factory=list)

    def server_addrs(
        self, transport: Optional[str] = None, port: Optional[int] = None
    ) -> list[str]:
        """Format every entry of ``self.hostnames`` as a channel address.

        Args:
            transport: overrides ``self.transport``; any falsy value
                (``None`` or ``""``) falls back to the field.
            port: overrides ``self.port``; any falsy value (``None`` or ``0``)
                falls back to the field.

        Returns:
            One address per hostname: ``tcp!<sockaddr>`` where the sockaddr
            comes from ``get_sockaddr(hostname, port)``, or
            ``metatls!<hostname>:<port>`` with the hostname left unresolved.

        Raises:
            ValueError: if the effective transport is neither 'tcp' nor 'metatls'.
        """
        chosen_transport = transport if transport else self.transport
        chosen_port = port if port else self.port

        if chosen_transport == "metatls":
            return [f"metatls!{host}:{chosen_port}" for host in self.hostnames]

        if chosen_transport != "tcp":
            raise ValueError(
                f"Unsupported transport: {chosen_transport}. Must be one of: 'tcp' or 'metatls'"
            )

        # TCP channel addresses carry an ip address, so each hostname is resolved first
        addrs = []
        for host in self.hostnames:
            addrs.append(f"tcp!{get_sockaddr(host, chosen_port)}")
        return addrs
65
+
66
+
67
+ def _tag(mesh_name: str, tag_template: str) -> str:
68
+ return string.Template(tag_template).substitute(mesh_name=mesh_name)
69
+
70
+
71
def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
    """Record the mesh's host type, gpu count and transport in ``appdef.metadata``.

    Each value is stored under a key rendered from the matching tag template
    for ``mesh_spec.name``; the gpu count is stored in its ``str`` form.
    ``appdef`` is mutated in place.
    """
    values_by_template = {
        _TAG_HOST_TYPE: mesh_spec.host_type,
        _TAG_GPUS: str(mesh_spec.gpus),
        _TAG_TRANSPORT: mesh_spec.transport,
    }
    for template, value in values_by_template.items():
        appdef.metadata[_tag(mesh_spec.name, template)] = value
75
+
76
+
77
def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
    """Rebuild the ``MeshSpec`` of the role named ``mesh_name`` from ``appdef``.

    The first role whose name equals ``mesh_name`` supplies ``num_hosts``
    (its ``num_replicas``) and ``port`` (the "mesh" entry of its ``port_map``,
    defaulting to ``DEFAULT_REMOTE_ALLOCATOR_PORT``). Host type, gpu count and
    transport are read from ``appdef.metadata`` and fall back to their unset
    or default values when the tag is missing.

    Returns:
        The reconstructed spec, or ``None`` if no role is named ``mesh_name``.
    """
    role = next((r for r in appdef.roles if r.name == mesh_name), None)
    if role is None:
        return None

    metadata = appdef.metadata
    num_hosts = role.num_replicas
    host_type = metadata.get(_tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR)
    # the gpu count is kept as a string in the metadata
    gpus = int(metadata.get(_tag(mesh_name, _TAG_GPUS), str(_UNSET_INT)))
    transport = metadata.get(_tag(mesh_name, _TAG_TRANSPORT), "tcp")
    port = role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT)

    return MeshSpec(
        name=mesh_name,
        num_hosts=num_hosts,
        host_type=host_type,
        gpus=gpus,
        transport=transport,
        port=port,
    )
94
+
95
+
96
def mesh_spec_from_str(mesh_spec_str: str) -> MeshSpec:
    """Parses the given string into a MeshSpec.

    Args:
        mesh_spec_str: A string representation of the mesh specification
            in the format 'NAME:NUM_HOSTS:HOST_TYPE' (e.g. 'trainer:8:gpu.medium').

    Returns:
        A MeshSpec built from the three parts, with ``gpus`` taken from
        ``specs.resource(h=HOST_TYPE).gpu``. All remaining fields keep their
        defaults.

    Raises:
        AssertionError: if the string does not have exactly three
            ':'-separated parts, or NUM_HOSTS is not a decimal number.
    """
    parts = mesh_spec_str.split(":")
    # Raised explicitly instead of via `assert` so the validation is still
    # enforced when Python runs with -O (which strips assert statements).
    # The exception type and message are unchanged for existing callers.
    if len(parts) != 3:
        raise AssertionError(
            f"`{mesh_spec_str}` is not of the form 'NAME:NUM_HOSTS:HOST_TYPE'"
        )

    name, num_hosts, host_type = parts
    gpus = specs.resource(h=host_type).gpu

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() cannot parse.
    if not num_hosts.isdecimal():
        raise AssertionError(f"`{num_hosts}` is not a number in: {mesh_spec_str}")

    return MeshSpec(name, int(num_hosts), host_type, gpus)
114
+
115
+
116
@dataclass
class ServerSpec:
    """Information about a monarch server as returned by the scheduler's
    'describe' API. This is the return value of the
    ``monarch.tools.commands.info`` API.
    """

    name: str
    state: specs.AppState
    meshes: list[MeshSpec]

    @property
    def is_running(self) -> bool:
        """True when ``state`` equals ``specs.AppState.RUNNING``."""
        running = self.state == specs.AppState.RUNNING
        return running

    def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
        """Return the first mesh whose name equals ``mesh_name``.

        Raises:
            ValueError: if no mesh of this server has that name.
        """
        match = next((m for m in self.meshes if m.name == mesh_name), None)
        if match is not None:
            return match

        raise ValueError(
            f"Mesh: '{mesh_name}' not found in job: {self.name}. Try one of: {self.get_mesh_names()}"
        )

    def get_mesh_names(self) -> list[str]:
        """Return the names of all meshes, in the order they are stored."""
        return [spec.name for spec in self.meshes]

    def to_json(self) -> dict[str, Any]:
        """Returns a JSON-serializable dict form of this struct that can be
        printed to console by:

        .. code-block:: python

            import json

            server_spec = ServerSpec(...)
            print(json.dumps(server_spec.to_json(), indent=2))
        """
        payload: dict[str, Any] = {
            "name": self.name,
            "state": self.state.name,
        }

        # meshes are keyed by name; transport and port are not part of the output
        meshes_json = {}
        for spec in self.meshes:
            meshes_json[spec.name] = {
                "host_type": spec.host_type,
                "hosts": spec.num_hosts,
                "gpus": spec.gpus,
                "hostnames": spec.hostnames,
            }
        payload["meshes"] = meshes_json
        return payload
@@ -0,0 +1,69 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import logging
9
+ import socket
10
+ from typing import Optional
11
+
12
+ logger: logging.Logger = logging.getLogger(__name__)
13
+
14
+
15
def get_sockaddr(hostname: str, port: int) -> str:
    """Resolve ``hostname`` to a TCP-capable socket address string.

    IPv6 is tried first and IPv4 second; the first address returned by
    ``socket.getaddrinfo`` for the first family that resolves is used.
    The result has one of two shapes:

      1. `{ipv4.address}:{port}` (e.g. `127.0.0.1:8080`)
      2. `[{ipv6:address}]:{port}` (e.g. `[::1]:8080`)

    Raises:
        RuntimeError: if the hostname resolves to neither an IPv6 nor an
            IPv4 address that supports `SOCK_STREAM` (TCP).
    """

    def lookup(requested: socket.AddressFamily) -> Optional[str]:
        # Returns the formatted socket address, or None when the hostname has
        # no address of the requested family.
        try:
            # patternlint-disable-next-line python-dns-deps (only used for oss)
            candidates = socket.getaddrinfo(
                hostname, port, requested, type=socket.SOCK_STREAM
            )
        except socket.gaierror as err:
            logger.info(
                "no %s address that can bind TCP sockets for `%s:%d` (error: %s)",
                requested.name,
                hostname,
                port,
                err,
            )
            return None

        if not candidates:
            return None

        # only the first address is used
        resolved_family, _, _, _, sockaddr = candidates[0]

        # sockaddr is a 2-tuple (ipv4) or a 4-tuple (ipv6);
        # either way its first element is the ip address
        host_ip = str(sockaddr[0])

        if resolved_family == socket.AF_INET6:
            formatted = f"[{host_ip}]:{port}"
        else:  # socket.AF_INET
            formatted = f"{host_ip}:{port}"

        logger.info(
            "resolved %s address `%s` for `%s:%d`",
            resolved_family.name,
            formatted,
            hostname,
            port,
        )
        return formatted

    for candidate_family in (socket.AF_INET6, socket.AF_INET):
        resolved = lookup(candidate_family)
        if resolved:
            return resolved

    raise RuntimeError(
        f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
        " Check the network configuration on the host."
    )
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict