torchmonarch-nightly 2025.6.27__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import string
|
9
|
+
from dataclasses import dataclass, field
|
10
|
+
from typing import Any, Optional
|
11
|
+
|
12
|
+
from monarch.tools.network import get_sockaddr
|
13
|
+
from torchx import specs
|
14
|
+
|
15
|
+
# Port used for a mesh when no explicit port is provided.
DEFAULT_REMOTE_ALLOCATOR_PORT = 26600

# Metadata tag templates. The `${mesh_name}` placeholder is substituted
# (via `string.Template`) with the name of the mesh being tagged.
_TAG_MESHES_PREFIX = "monarch/meshes/${mesh_name}/"
_TAG_HOST_TYPE: str = f"{_TAG_MESHES_PREFIX}host_type"
_TAG_GPUS: str = f"{_TAG_MESHES_PREFIX}gpus"
_TAG_TRANSPORT: str = f"{_TAG_MESHES_PREFIX}transport"

# Placeholder values for mesh fields that have not been specified.
_UNSET_INT = -1
_UNSET_STR = "__UNSET__"
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass
class MeshSpec:
    """Describes a single mesh (role) of a monarch job.

    Used both as the 'input' specification of how to set up the mesh role
    when submitting the job and as the return value of the 'info'
    (describe) API.
    """

    name: str
    num_hosts: int
    host_type: str = _UNSET_STR
    gpus: int = _UNSET_INT
    # NOTE: a plain str is used instead of the
    #  monarch._rust_bindings.monarch_hyperactor.channel.ChannelTransport enum
    #  b/c the rust binding doesn't have Python enum semantics, hence doesn't
    #  serialize well
    transport: str = "tcp"
    port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
    hostnames: list[str] = field(default_factory=list)

    def server_addrs(
        self, transport: Optional[str] = None, port: Optional[int] = None
    ) -> list[str]:
        """Return this mesh's hostnames (servers) in channel address format.

        Args:
            transport: overrides ``self.transport`` when given (truthy).
            port: overrides ``self.port`` when given (truthy).

        Returns:
            One address per entry of ``self.hostnames``: ``tcp!<sockaddr>``
            for the "tcp" transport, ``metatls!<hostname>:<port>`` for
            "metatls".

        Raises:
            ValueError: if the effective transport is neither "tcp" nor
                "metatls".
        """
        effective_transport = transport or self.transport
        effective_port = port or self.port

        if effective_transport == "metatls":
            return [
                f"metatls!{hostname}:{effective_port}" for hostname in self.hostnames
            ]

        if effective_transport != "tcp":
            raise ValueError(
                f"Unsupported transport: {effective_transport}. Must be one of: 'tcp' or 'metatls'"
            )

        # TCP needs each hostname resolved to an ip socket address
        return [
            f"tcp!{get_sockaddr(hostname, effective_port)}"
            for hostname in self.hostnames
        ]
|
65
|
+
|
66
|
+
|
67
|
+
def _tag(mesh_name: str, tag_template: str) -> str:
    """Return ``tag_template`` with its ``${mesh_name}`` placeholder filled in."""
    template = string.Template(tag_template)
    return template.substitute({"mesh_name": mesh_name})
|
69
|
+
|
70
|
+
|
71
|
+
def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
    """Record the mesh's host type, gpu count and transport in ``appdef.metadata``.

    Each value is stored under a tag derived from the mesh name; the gpu
    count is stored as a string.
    """
    values_by_template = {
        _TAG_HOST_TYPE: mesh_spec.host_type,
        _TAG_GPUS: str(mesh_spec.gpus),
        _TAG_TRANSPORT: mesh_spec.transport,
    }
    for tag_template, value in values_by_template.items():
        appdef.metadata[_tag(mesh_spec.name, tag_template)] = value
|
75
|
+
|
76
|
+
|
77
|
+
def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
    """Build the MeshSpec of the role named ``mesh_name`` from ``appdef``.

    The host type, gpu count and transport are read back from the
    ``appdef.metadata`` tags of the mesh (falling back to the unset
    placeholders / "tcp" when a tag is absent); the port is taken from the
    role's ``port_map`` entry "mesh", defaulting to
    ``DEFAULT_REMOTE_ALLOCATOR_PORT``.

    Returns:
        The MeshSpec of the first role whose name equals ``mesh_name``, or
        ``None`` when there is no such role.
    """
    role = next((r for r in appdef.roles if r.name == mesh_name), None)
    if role is None:
        return None

    metadata = appdef.metadata
    return MeshSpec(
        name=mesh_name,
        num_hosts=role.num_replicas,
        host_type=metadata.get(_tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR),
        # the gpu count is stored as a string in the metadata
        gpus=int(metadata.get(_tag(mesh_name, _TAG_GPUS), str(_UNSET_INT))),
        transport=metadata.get(_tag(mesh_name, _TAG_TRANSPORT), "tcp"),
        port=role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT),
    )
|
94
|
+
|
95
|
+
|
96
|
+
def mesh_spec_from_str(mesh_spec_str: str) -> MeshSpec:
    """Parses the given string into a MeshSpec.

    Args:
        mesh_spec_str: A string representation of the mesh specification
            in the format 'NAME:NUM_HOSTS:HOST_TYPE' (e.g. 'trainer:8:gpu.medium').

    Returns:
        A MeshSpec with the parsed name, number of hosts and host type, and
        the gpu count of the named resource for HOST_TYPE.

    Raises:
        ValueError: if ``mesh_spec_str`` does not consist of exactly three
            ':'-separated parts, or NUM_HOSTS is not a non-negative integer.
    """
    parts = mesh_spec_str.split(":")

    # Explicit raises instead of `assert`: assertions are stripped when
    # Python runs with -O, which would let malformed input slip through.
    if len(parts) != 3:
        raise ValueError(
            f"`{mesh_spec_str}` is not of the form 'NAME:NUM_HOSTS:HOST_TYPE'"
        )

    name, num_hosts, host_type = parts

    # `isdecimal` (rather than `isdigit`) matches what `int()` can parse;
    # `isdigit` also accepts characters such as superscript digits.
    if not num_hosts.isdecimal():
        raise ValueError(f"`{num_hosts}` is not a number in: {mesh_spec_str}")

    # Validate the cheap, local parts first; only then look up the resource.
    gpus = specs.resource(h=host_type).gpu

    return MeshSpec(name, int(num_hosts), host_type, gpus)
|
114
|
+
|
115
|
+
|
116
|
+
@dataclass
class ServerSpec:
    """Holds information (as returned by the 'describe' API of the scheduler)
    about the monarch server. This is the return value of the
    ``monarch.tools.commands.info`` API.
    """

    name: str
    state: specs.AppState
    meshes: list[MeshSpec]

    @property
    def is_running(self) -> bool:
        """Whether the server's state is ``specs.AppState.RUNNING``."""
        return self.state == specs.AppState.RUNNING

    def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
        """Return the spec of the mesh named ``mesh_name``.

        Raises:
            ValueError: if this server has no mesh with that name.
        """
        for mesh_spec in self.meshes:
            if mesh_spec.name == mesh_name:
                return mesh_spec

        raise ValueError(
            f"Mesh: '{mesh_name}' not found in job: {self.name}. Try one of: {self.get_mesh_names()}"
        )

    def get_mesh_names(self) -> list[str]:
        """Return the names of all meshes, in the order they are stored."""
        return [m.name for m in self.meshes]

    def to_json(self) -> dict[str, Any]:
        """Returns the JSON form of this struct that can be printed to console by:

        .. code-block:: python

            import json

            server_spec = ServerSpec(...)
            print(json.dumps(server_spec.to_json(), indent=2))
        """

        return {
            "name": self.name,
            "state": self.state.name,
            # keyed by mesh name
            "meshes": {
                mesh.name: {
                    "host_type": mesh.host_type,
                    "hosts": mesh.num_hosts,
                    "gpus": mesh.gpus,
                    "hostnames": mesh.hostnames,
                }
                for mesh in self.meshes
            },
        }
|
monarch/tools/network.py
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import logging
|
9
|
+
import socket
|
10
|
+
from typing import Optional
|
11
|
+
|
12
|
+
logger: logging.Logger = logging.getLogger(__name__)


def get_sockaddr(hostname: str, port: int) -> str:
    """Resolve ``hostname`` and ``port`` to a TCP-capable socket address string.

    The returned socket address has one of two forms:
        1. `{ipv4.address}:{port}` (e.g. `127.0.0.1:8080`)
        2. `[{ipv6:address}]:{port}` (e.g. `[::1]:8080`)

    Resolution is attempted for IPv6 first and falls back to IPv4; only
    addresses that support `SOCK_STREAM` (TCP) are considered.

    Raises a `RuntimeError` if neither an ipv6 nor an ipv4 address can be
    resolved from hostname.
    """

    def resolve_sockaddr(family: socket.AddressFamily) -> Optional[str]:
        # Returns the formatted socket address for `family`, or None when
        # the hostname has no address in that family.
        try:
            # patternlint-disable-next-line python-dns-deps (only used for oss)
            candidates = socket.getaddrinfo(
                hostname, port, family, type=socket.SOCK_STREAM
            )
        except socket.gaierror as err:
            logger.info(
                "no %s address that can bind TCP sockets for `%s:%d` (error: %s)",
                family.name,
                hostname,
                port,
                err,
            )
            return None

        if not candidates:
            return None

        # use the first address
        resolved_family, _, _, _, sockaddr = candidates[0]

        # sockaddr is a 2-tuple (ipv4) or a 4-tuple (ipv6)
        # in both cases the first element is the ip addr
        host_ip = str(sockaddr[0])

        if resolved_family == socket.AF_INET6:
            socket_address = f"[{host_ip}]:{port}"
        else:  # socket.AF_INET
            socket_address = f"{host_ip}:{port}"

        logger.info(
            "resolved %s address `%s` for `%s:%d`",
            resolved_family.name,
            socket_address,
            hostname,
            port,
        )
        return socket_address

    # prefer IPv6, fall back to IPv4
    for family in (socket.AF_INET6, socket.AF_INET):
        resolved = resolve_sockaddr(family)
        if resolved:
            return resolved

    raise RuntimeError(
        f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
        " Check the network configuration on the host."
    )
|