torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/common/mast.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # TODO: This can all be replaced using a cleaner MAST Python library.
8
+ # See https://www.internalfb.com/wiki/Components_in_AI/MAST/References/MAST_API_Reference/Read_APIs
9
+
10
+ import json
11
+ import logging
12
+ import subprocess
13
+ import time
14
+ from datetime import datetime
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def _job_definition(jobname):
    """Return the parsed JSON job definition for the MAST job *jobname*.

    Runs ``mast get-job-definition <jobname> --json`` and decodes its
    standard output with ``json.loads``.

    Raises:
        subprocess.CalledProcessError: if the ``mast`` command exits non-zero.
    """
    command = ["mast", "get-job-definition", jobname, "--json"]
    raw_output = subprocess.check_output(command)
    return json.loads(raw_output)
23
+
24
+
25
def _job_status(jobname):
    """Return the parsed JSON status for the MAST job *jobname*.

    Runs ``mast get-status <jobname> --json`` and decodes its standard
    output with ``json.loads``.

    Raises:
        subprocess.CalledProcessError: if the ``mast`` command exits non-zero.
    """
    command = ["mast", "get-status", jobname, "--json"]
    raw_output = subprocess.check_output(command)
    return json.loads(raw_output)
29
+
30
+
31
def _user_jobs(jobname=None):
    """Return the current user's MAST jobs as a list of parsed JSON records.

    Runs ``mast list-jobs --my --json`` and decodes every non-blank line of
    its output as one JSON document.

    Args:
        jobname: when not ``None``, ``--job-name <jobname>`` is added to the
            command line.

    Returns:
        A list with one decoded JSON value per non-blank output line.

    Raises:
        subprocess.CalledProcessError: if the ``mast`` command exits non-zero.
    """
    command = ["mast", "list-jobs", "--my", "--json"]
    if jobname is not None:
        # extend, not append: argv must be a flat sequence of strings. A
        # nested list makes subprocess raise TypeError before running anything.
        command.extend(["--job-name", jobname])
    output = subprocess.check_output(command)
    # Skip blank / whitespace-only lines (e.g. the trailing newline).
    return [json.loads(line) for line in output.splitlines() if line.strip()]
40
+
41
+
42
class MastJob:
    """Lazy, caching view of a single MAST job queried through the CLI.

    Job definition, status, listing details and Tupperware job specs are
    fetched on first use via the module-level helpers / ``tw print`` and then
    cached on the instance. Most accessors take an optional ``task_group``;
    when omitted, the ``default_task_group`` given at construction is used.
    """

    def __init__(self, name, default_task_group=None):
        """Create a handle for job *name*; no command is run here."""
        self._name = name
        self._def = None
        self._status = None
        self._details = None
        # Tupperware specs, cached per job handle so that different task
        # groups do not share one cached entry.
        self._twjob_specs = {}
        self._default_task_group = default_task_group

    def _get_task_group(self, task_group):
        """Resolve *task_group*, falling back to the default task group.

        Raises:
            ValueError: if neither an explicit nor a default group is set.
        """
        if task_group is None:
            task_group = self._default_task_group
        if task_group is None:
            raise ValueError("No default task group set and none specified")
        return task_group

    def _get_status(self, force_reload=False):
        """Return the job status, re-fetching it when *force_reload* is true."""
        if self._status is None or force_reload:
            self._status = _job_status(self._name)
        return self._status

    def _get_definitions(self):
        """Return the (cached) full job definition."""
        if self._def is None:
            self._def = _job_definition(self._name)
        return self._def

    def _get_definition(self, task_group=None):
        """Return the ``hpcTaskGroups`` entry whose name is *task_group*.

        Raises:
            ValueError: if no task group with that name exists.
        """
        task_group = self._get_task_group(task_group)
        for definition in self._get_definitions()["hpcTaskGroups"]:
            if definition["name"] == task_group:
                return definition
        raise ValueError(f"Task group {task_group} not found in job definition")

    def _get_details(self):
        """Return the (cached) job-listing records for this job name.

        The value is whatever ``_user_jobs(name)`` returns, i.e. a list of
        decoded JSON records.
        """
        if self._details is None:
            self._details = _user_jobs(self._name)
        return self._details

    def _get_twjob_specs(self, task_group=None):
        """Return the ``tw print`` spec for *task_group*'s job handle.

        Results are cached per handle, so asking for a second task group does
        not return the first group's spec.
        """
        handle = self.get_twjob_handle(self._get_task_group(task_group))
        if handle not in self._twjob_specs:
            output = subprocess.check_output(["tw", "print", handle, "--json"])
            self._twjob_specs[handle] = json.loads(output)[handle]
        return self._twjob_specs[handle]

    def name(self):
        """Return the job name."""
        return self._name

    def is_running(self) -> bool:
        """Return True when the job and every task group report ``RUNNING``.

        Always re-fetches the status. For each task group, only the last
        entry of its execution-attempt list is considered.
        """
        status = self._get_status(force_reload=True)
        if status["state"] != "RUNNING":
            return False
        attempts_by_group = status["latestAttempt"]["taskGroupExecutionAttempts"]
        return all(
            attempts[-1]["state"] == "RUNNING"
            for attempts in attempts_by_group.values()
        )

    def get_arguments(self, task_group=None):
        """Return ``spec.arguments`` of the task group definition."""
        return self._get_definition(task_group)["spec"]["arguments"]

    def get_task_count(self, task_group=None):
        """Return ``taskCount`` of the task group definition."""
        return self._get_definition(task_group)["taskCount"]

    def uses_nfs(self, task_group=None) -> bool:
        """Return True when ``"nfs"`` occurs in the task group's command."""
        return "nfs" in self._get_definition(task_group)["spec"]["command"]

    def wait_for_running(self, timeout, task_group=None, poll_interval=10):
        """Block until the job and the given task group are ``RUNNING``.

        Args:
            timeout: maximum number of seconds to wait.
            task_group: task group to watch (default task group if None).
            poll_interval: seconds to sleep between status polls.

        Raises:
            TimeoutError: if the job is not running after *timeout* seconds.
        """
        group = self._get_task_group(task_group)
        # Monotonic clock: wall-clock adjustments must not shorten or extend
        # the timeout.
        started = time.monotonic()
        while True:
            status = self._get_status(force_reload=True)
            if status["state"] == "RUNNING":
                # NOTE(review): this inspects attempt [0] while is_running()
                # and __repr__ inspect [-1]; confirm which entry is the
                # current attempt.
                app_state = status["latestAttempt"]["taskGroupExecutionAttempts"][
                    group
                ][0]["state"]
                if app_state == "RUNNING":
                    break
                logger.warning(
                    "waiting for mast job %s to start, current worker state: %s",
                    self.name(),
                    app_state,
                )
            else:
                logger.warning(
                    "waiting for mast job %s to start, current state: %s",
                    self.name(),
                    status["state"],
                )

            if time.monotonic() - started > timeout:
                raise TimeoutError(
                    f"Timed out waiting for {self.name()} to start running."
                )
            time.sleep(poll_interval)

    def get_port(self, task_group=None) -> int:
        """Return the port parsed from the fourth job argument.

        The argument is expected to look like ``tcp://<hostname>:<port>``.

        Raises:
            RuntimeError: if the argument is missing or cannot be parsed.
        """
        args = self._get_definition(task_group)["spec"]["arguments"]
        try:
            return int(args[3].removeprefix("tcp://").split(":")[1])
        except Exception as e:
            raise RuntimeError(
                f"Failed to parse endpoint from mast job {self._name}. "
                f"Invalid args in job definition: {' '.join(args)}. "
                f"Expected format: -mmonarch.notebook worker --endpoint tcp://<hostname>:<port>"
            ) from e

    def get_create_time(self):
        """Return ``createdTimestamp`` of this job's listing record.

        The listing is a list of records, so the one whose ``hpcJobName``
        equals this job's name is selected. A plain dict is also accepted in
        case the cache was populated with a single record.

        Raises:
            ValueError: if the listing has no record for this job.
        """
        details = self._get_details()
        if isinstance(details, dict):
            return details["createdTimestamp"]
        for record in details:
            if record.get("hpcJobName") == self._name:
                return record["createdTimestamp"]
        raise ValueError(f"Job {self._name} not found in job listing")

    def get_start_time(self):
        """Return the ``PENDING`` transition timestamp of the latest attempt."""
        return self._get_status()["latestAttempt"]["jobStateTransitionTimestampSecs"][
            "PENDING"
        ]

    def get_num_hosts(self, task_group=None):
        """Return ``taskCount`` of the task group definition."""
        return self._get_definition(task_group)["taskCount"]

    def get_gpus_per_host(self, task_group=None):
        """Return ``spec.resourceLimit.compute.gpu`` of the task group."""
        return self._get_definition(task_group)["spec"]["resourceLimit"]["compute"][
            "gpu"
        ]

    def get_twjob_handle(self, task_group=None):
        """Return ``twJobHandle`` of the task group's first listed attempt."""
        return self._get_status()["latestAttempt"]["taskGroupExecutionAttempts"][
            self._get_task_group(task_group)
        ][0]["twJobHandle"]

    def get_hostnames(self, task_group=None):
        """Return the task group's hostnames as a list.

        Splits the ``MAST_HPC_TASK_GROUP_HOSTNAMES`` environment variable of
        the Tupperware spec on commas.
        """
        return self._get_twjob_specs(task_group)["envVariables"][
            "MAST_HPC_TASK_GROUP_HOSTNAMES"
        ].split(",")

    def _get_job_spec_env(self, task_group=None):
        """Return ``spec.env`` of the task group definition."""
        return self._get_definition(task_group)["spec"]["env"]

    def get_nfs_home_dir(self, task_group=None):
        """Return ``MONARCH_NFS_HOME_DIR`` from the job env, or None."""
        return self._get_job_spec_env(task_group).get("MONARCH_NFS_HOME_DIR")

    def get_oilfs_home_dir(self, task_group=None):
        """Return ``MONARCH_OILFS_HOME_DIR`` from the job env, or None."""
        return self._get_job_spec_env(task_group).get("MONARCH_OILFS_HOME_DIR")

    def get_nfs_workspace_dir(self, task_group=None):
        """Return ``WORKSPACE_DIR`` if an NFS home dir is set, else None."""
        if self.get_nfs_home_dir(task_group) is None:
            return None
        return self._get_job_spec_env(task_group).get("WORKSPACE_DIR")

    def get_oilfs_workspace_dir(self, task_group=None):
        """Return ``WORKSPACE_DIR`` if an OILFS home dir is set, else None."""
        if self.get_oilfs_home_dir(task_group) is None:
            return None
        return self._get_job_spec_env(task_group).get("WORKSPACE_DIR")

    def __repr__(self):
        """Return a JSON summary of the job.

        Uses the default task group for host/GPU counts, and may trigger
        status/definition fetches if nothing is cached yet.
        """
        status = None
        job = {}
        job["name"] = self._name
        job["latest_attempt_start_time"] = str(
            datetime.fromtimestamp(self.get_start_time())
        )
        job["hosts"] = self.get_num_hosts()
        job["gpus_per_host"] = self.get_gpus_per_host()
        status = self._get_status()
        job["job_state"] = status["state"]
        job["task_states"] = {
            task_group_name: task_group_states[-1]["state"]
            for task_group_name, task_group_states in status["latestAttempt"][
                "taskGroupExecutionAttempts"
            ].items()
        }
        return json.dumps(job, indent=2)
214
+
215
+
216
def mast_get_jobs(default_task_group=None):
    """Return the current user's jobs as ``MastJob`` objects, newest first.

    One ``MastJob`` is built per record returned by ``_user_jobs()``, using
    the record's ``hpcJobName``. The result is ordered by descending
    ``get_start_time()``.

    Args:
        default_task_group: passed through to every ``MastJob``.
    """
    jobs = [
        MastJob(record["hpcJobName"], default_task_group)
        for record in _user_jobs()
    ]
    jobs.sort(key=lambda mast_job: mast_job.get_start_time(), reverse=True)
    return jobs