torchmonarch-nightly 2025.6.27__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/common/mast.py
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# TODO: This can all be replaced using a cleaner MAST python library.
|
8
|
+
# See https://www.internalfb.com/wiki/Components_in_AI/MAST/References/MAST_API_Reference/Read_APIs
|
9
|
+
|
10
|
+
import json
|
11
|
+
import logging
|
12
|
+
import subprocess
|
13
|
+
import time
|
14
|
+
from datetime import datetime
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
def _job_definition(jobname):
    """Return the parsed JSON printed by ``mast get-job-definition <jobname> --json``.

    Errors from ``subprocess.check_output`` (missing binary, non-zero exit)
    and from ``json.loads`` (malformed output) propagate to the caller.
    """
    argv = ["mast", "get-job-definition", jobname, "--json"]
    raw_output = subprocess.check_output(argv)
    return json.loads(raw_output)
|
23
|
+
|
24
|
+
|
25
|
+
def _job_status(jobname):
    """Return the parsed JSON printed by ``mast get-status <jobname> --json``.

    Errors from ``subprocess.check_output`` (missing binary, non-zero exit)
    and from ``json.loads`` (malformed output) propagate to the caller.
    """
    argv = ["mast", "get-status", jobname, "--json"]
    raw_output = subprocess.check_output(argv)
    return json.loads(raw_output)
|
29
|
+
|
30
|
+
|
31
|
+
def _user_jobs(jobname=None):
    """Return the calling user's MAST jobs as a list of parsed JSON objects.

    Runs ``mast list-jobs --my --json`` and parses every non-blank output line
    as its own JSON document.

    Args:
        jobname: Optional job name; when given, ``--job-name <jobname>`` is
            added to the command line.

    Returns:
        A list with one decoded JSON value per non-blank output line (empty
        when the command prints nothing).

    Raises:
        Whatever ``subprocess.check_output`` raises when the command cannot be
        run or exits non-zero, and ``json.JSONDecodeError`` for a line that is
        not valid JSON.
    """
    command = ["mast", "list-jobs", "--my", "--json"]
    if jobname is not None:
        # extend (not append): every argv element must be a string of its own.
        # Appending the pair as one nested list makes subprocess raise TypeError.
        command.extend(["--job-name", jobname])
    output = subprocess.check_output(command)
    # Skip blank / whitespace-only lines such as the trailing newline.
    return [json.loads(line) for line in output.split(b"\n") if line.strip()]
|
40
|
+
|
41
|
+
|
42
|
+
class MastJob:
    """Lazy, cached view of one MAST job's definition, status and metadata.

    The underlying data is obtained by shelling out to the ``mast`` and ``tw``
    command-line tools and is cached on the instance after the first fetch.
    Only the status can be refreshed (``_get_status(force_reload=True)``).

    Most accessors take an optional ``task_group``; when it is omitted the
    ``default_task_group`` passed to the constructor is used, and a
    ``ValueError`` is raised if neither is available.
    """

    def __init__(self, name, default_task_group=None):
        """Create a handle for job ``name``; performs no I/O."""
        self._name = name
        self._def = None
        self._status = None
        self._details = None
        # tw job specs keyed by tw job handle, so that different task groups
        # never share one cached spec.
        self._twjob_specs = {}
        self._default_task_group = default_task_group

    def _get_task_group(self, task_group):
        """Resolve ``task_group``, falling back to the instance default.

        Raises:
            ValueError: if both the argument and the default are ``None``.
        """
        if task_group is None:
            task_group = self._default_task_group
        if task_group is None:
            raise ValueError("No default task group set and none specified")
        return task_group

    def _get_status(self, force_reload=False):
        """Return the job status, fetching it on first use or when forced."""
        if self._status is None or force_reload:
            self._status = _job_status(self._name)
        return self._status

    def _get_definitions(self):
        """Return the full job definition, fetching it on first use."""
        if self._def is None:
            self._def = _job_definition(self._name)
        return self._def

    def _get_definition(self, task_group=None):
        """Return the ``hpcTaskGroups`` entry whose ``name`` is ``task_group``.

        Raises:
            ValueError: if no task group can be resolved, or the job
                definition has no entry with that name.
        """
        task_group = self._get_task_group(task_group)
        for d in self._get_definitions()["hpcTaskGroups"]:
            if d["name"] == task_group:
                return d
        raise ValueError(f"Task group {task_group} not found in job definition")

    def _get_details(self):
        """Return the job-listing entries for this job name (a list), cached."""
        if self._details is None:
            self._details = _user_jobs(self._name)
        return self._details

    def _get_twjob_specs(self, task_group=None):
        """Return the ``tw print`` spec for the task group's tw job handle.

        Specs are cached per handle; a single shared cache slot would hand the
        first task group's spec to every later task group.
        """
        task_group = self._get_task_group(task_group)
        handle = self.get_twjob_handle(task_group)
        if handle not in self._twjob_specs:
            self._twjob_specs[handle] = json.loads(
                subprocess.check_output(["tw", "print", handle, "--json"])
            )[handle]
        return self._twjob_specs[handle]

    def name(self):
        """Return the job name given at construction."""
        return self._name

    def is_running(self):
        """Return True when the job and every task group are ``RUNNING``.

        Always reloads the status. For each task group only the last entry of
        its attempt list is inspected.
        """
        status = self._get_status(force_reload=True)
        if status["state"] != "RUNNING":
            return False
        attempts_by_group = status["latestAttempt"]["taskGroupExecutionAttempts"]
        return all(
            attempts[-1]["state"] == "RUNNING"
            for attempts in attempts_by_group.values()
        )

    def get_arguments(self, task_group=None):
        """Return ``spec.arguments`` of the task group's definition."""
        return self._get_definition(task_group)["spec"]["arguments"]

    def get_task_count(self, task_group=None):
        """Return ``taskCount`` of the task group's definition."""
        return self._get_definition(task_group)["taskCount"]

    def uses_nfs(self, task_group=None):
        """Return whether ``"nfs"`` is contained in the task group's ``spec.command``.

        NOTE(review): this is a substring test if ``command`` is a string and
        an element test if it is a list -- confirm the type against the MAST
        job definition schema.
        """
        return "nfs" in self._get_definition(task_group)["spec"]["command"]

    def wait_for_running(self, timeout, task_group=None):
        """Block until the job and the given task group report ``RUNNING``.

        The status is reloaded on every iteration, with a 10 second sleep
        between polls.

        Args:
            timeout: Maximum time to wait, in seconds.
            task_group: Task group to watch; defaults to the instance default.

        Raises:
            TimeoutError: if the job is not running once ``timeout`` seconds
                have elapsed (checked after each poll).
            ValueError: if the job is ``RUNNING`` but no task group can be
                resolved.
        """
        # Monotonic clock: wall-clock adjustments must not stretch or shrink
        # the timeout.
        start = time.monotonic()
        while True:
            status = self._get_status(force_reload=True)
            if status["state"] == "RUNNING":
                # NOTE(review): this reads the FIRST attempt ([0]) while
                # is_running() and __repr__ read the LAST ([-1]); confirm
                # which index is the current attempt.
                app_state = status["latestAttempt"]["taskGroupExecutionAttempts"][
                    self._get_task_group(task_group)
                ][0]["state"]
                if app_state == "RUNNING":
                    break
                logger.warning(
                    f"waiting for mast job {self.name()} to start, current worker state: {app_state}"
                )
            else:
                logger.warning(
                    f"waiting for mast job {self.name()} to start, current state: {status['state']}"
                )

            if time.monotonic() - start > timeout:
                raise TimeoutError(
                    f"Timed out waiting for {self.name()} to start running."
                )
            time.sleep(10)

    def get_port(self, task_group=None):
        """Return the TCP port from the task group's ``--endpoint`` argument.

        The endpoint is the argument following ``--endpoint`` when that flag
        is present, otherwise the fourth argument. The port is the text after
        the last ``:`` so that hosts containing colons still parse.

        Raises:
            RuntimeError: if the arguments do not contain a parsable endpoint.
        """
        args = self._get_definition(task_group)["spec"]["arguments"]
        try:
            if "--endpoint" in args:
                endpoint = args[args.index("--endpoint") + 1]
            else:
                endpoint = args[3]
            return int(endpoint.removeprefix("tcp://").rsplit(":", 1)[1])
        except (AttributeError, IndexError, TypeError, ValueError) as e:
            # str() each element so building the message cannot itself fail
            # on non-string or missing arguments.
            shown_args = " ".join(map(str, args or []))
            raise RuntimeError(
                f"Failed to parse endpoint from mast job {self._name}. "
                f"Invalid args in job definition: {shown_args}. "
                f"Expected format: -mmonarch.notebook worker --endpoint tcp://<hostname>:<port>"
            ) from e

    def get_create_time(self):
        """Return ``createdTimestamp`` from this job's listing entry.

        The listing is a list of entries, so the entry whose ``hpcJobName``
        equals this job's name is selected first.

        Raises:
            ValueError: if the listing has no entry for this job name.
        """
        for entry in self._get_details():
            if entry.get("hpcJobName") == self._name:
                return entry["createdTimestamp"]
        raise ValueError(f"No job listing entry found for mast job {self._name}")

    def get_start_time(self):
        """Return the latest attempt's ``PENDING`` state-transition timestamp.

        NOTE(review): despite the method name this reads the ``PENDING``
        transition, not ``RUNNING`` -- confirm that is intended.
        """
        return self._get_status()["latestAttempt"]["jobStateTransitionTimestampSecs"][
            "PENDING"
        ]

    def get_num_hosts(self, task_group=None):
        """Return the task group's ``taskCount`` (same value as get_task_count)."""
        return self.get_task_count(task_group)

    def get_gpus_per_host(self, task_group=None):
        """Return ``spec.resourceLimit.compute.gpu`` of the task group."""
        return self._get_definition(task_group)["spec"]["resourceLimit"]["compute"][
            "gpu"
        ]

    def get_twjob_handle(self, task_group=None):
        """Return ``twJobHandle`` of the task group's first execution attempt."""
        return self._get_status()["latestAttempt"]["taskGroupExecutionAttempts"][
            self._get_task_group(task_group)
        ][0]["twJobHandle"]

    def get_hostnames(self, task_group=None):
        """Return the task group's hostnames as a list.

        Splits the ``MAST_HPC_TASK_GROUP_HOSTNAMES`` environment variable of
        the tw job spec on commas.
        """
        return self._get_twjob_specs(task_group)["envVariables"][
            "MAST_HPC_TASK_GROUP_HOSTNAMES"
        ].split(",")

    def _get_job_spec_env(self, task_group=None):
        """Return ``spec.env`` of the task group's definition."""
        return self._get_definition(task_group)["spec"]["env"]

    def get_nfs_home_dir(self, task_group=None):
        """Return ``MONARCH_NFS_HOME_DIR`` from the job env, or None if unset."""
        return self._get_job_spec_env(task_group).get("MONARCH_NFS_HOME_DIR")

    def get_oilfs_home_dir(self, task_group=None):
        """Return ``MONARCH_OILFS_HOME_DIR`` from the job env, or None if unset."""
        return self._get_job_spec_env(task_group).get("MONARCH_OILFS_HOME_DIR")

    def get_nfs_workspace_dir(self, task_group=None):
        """Return ``WORKSPACE_DIR`` if an NFS home dir is configured, else None."""
        if self.get_nfs_home_dir(task_group) is None:
            return None
        return self._get_job_spec_env(task_group).get("WORKSPACE_DIR")

    def get_oilfs_workspace_dir(self, task_group=None):
        """Return ``WORKSPACE_DIR`` if an OILFS home dir is configured, else None."""
        if self.get_oilfs_home_dir(task_group) is None:
            return None
        return self._get_job_spec_env(task_group).get("WORKSPACE_DIR")

    def __repr__(self):
        """Return an indented JSON summary of the job.

        Uses the default task group for host/GPU counts and may trigger CLI
        calls when the status or definition is not cached yet.
        """
        status = self._get_status()
        task_attempts = status["latestAttempt"]["taskGroupExecutionAttempts"]
        job = {
            "name": self._name,
            "latest_attempt_start_time": str(
                datetime.fromtimestamp(self.get_start_time())
            ),
            "hosts": self.get_num_hosts(),
            "gpus_per_host": self.get_gpus_per_host(),
            "job_state": status["state"],
            "task_states": {
                group_name: attempts[-1]["state"]
                for group_name, attempts in task_attempts.items()
            },
        }
        return json.dumps(job, indent=2)
|
214
|
+
|
215
|
+
|
216
|
+
def mast_get_jobs(default_task_group=None):
    """Return a ``MastJob`` for each of the user's jobs, latest start time first.

    One ``MastJob`` is built per entry returned by ``_user_jobs()``, using the
    entry's ``hpcJobName`` and the given ``default_task_group``. The list is
    ordered by ``MastJob.get_start_time()`` in descending order.
    """
    found = [
        MastJob(entry["hpcJobName"], default_task_group) for entry in _user_jobs()
    ]
    found.sort(key=lambda candidate: candidate.get_start_time(), reverse=True)
    return found
|