torchmonarch-nightly 2025.6.27__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/tools/cli.py ADDED
@@ -0,0 +1,167 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ import argparse
+ import json
+ import sys
+
+ from monarch.tools.commands import (
+     bounce,
+     component_args_from_cli,
+     create,
+     info,
+     kill,
+     stop,
+     torchx_runner,
+ )
+ from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
+     Config,
+     defaults,
+ )
+ from torchx.specs.finder import get_component
+
+
+ def config_from_cli_args(args: argparse.Namespace) -> Config:
+     config = defaults.config(args.scheduler, args.workspace)
+
+     if args.scheduler_args:
+         with torchx_runner() as runner:
+             opts = runner.scheduler_run_opts(config.scheduler)
+             for cfg_str in args.scheduler_args:
+                 parsed_cfg = opts.cfg_from_str(cfg_str)
+                 config.scheduler_args.update(parsed_cfg)
+
+     config.dryrun = args.dryrun
+     return config
+
+
+ class CreateCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         subparser.add_argument(
+             "-s",
+             "--scheduler",
+             type=str,
+             help="Scheduler to submit to",
+         )
+         subparser.add_argument(
+             "-cfg",
+             "--scheduler_args",
+             default=[],
+             action="append",
+             help="Scheduler args (e.g. `-cfg cluster=foo -cfg user=bar`)",
+         )
+         subparser.add_argument(
+             "--dryrun",
+             action="store_true",
+             default=False,
+             help="Just prints the scheduler request",
+         )
+         subparser.add_argument(
+             "--workspace",
+             help="The local directory to build into the job's image and make available on the job."
+             " Pass --workspace='' to disable any default workspaces configured for the scheduler",
+         )
+         subparser.add_argument(
+             "--component",
+             help="A custom TorchX component to use",
+         )
+         subparser.add_argument(
+             "-arg",
+             "--component_args",
+             default=[],
+             action="append",
+             help="Arguments to the component fn (e.g. `-arg a=b -arg c=d` to pass as `component_fn(a=b, c=d)`)",
+         )
+
+     def run(self, args: argparse.Namespace) -> None:
+         config = config_from_cli_args(args)
+
+         component_fn = (
+             get_component(args.component).fn
+             if args.component
+             else defaults.component_fn(config.scheduler)
+         )
+         component_args = component_args_from_cli(component_fn, args.component_args)
+         handle = create(config, component_fn)(**component_args)
+         print(handle)
+
+
+ class CommonArguments:
+     @staticmethod
+     def add_server_handle(subparser: argparse.ArgumentParser) -> None:
+         subparser.add_argument(
+             "server_handle",
+             type=str,
+             help="monarch server handle (e.g. slurm:///job_id)",
+         )
+
+
+ class InfoCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         CommonArguments.add_server_handle(subparser)
+
+     def run(self, args: argparse.Namespace) -> None:
+         server_spec = info(args.server_handle)
+         if server_spec is None:
+             print(
+                 f"Server: {args.server_handle} does not exist",
+                 file=sys.stderr,
+             )
+         else:
+             json.dump(server_spec.to_json(), indent=2, fp=sys.stdout)
+
+
+ class KillCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         CommonArguments.add_server_handle(subparser)
+
+     def run(self, args: argparse.Namespace) -> None:
+         kill(args.server_handle)
+
+
+ class BounceCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         CommonArguments.add_server_handle(subparser)
+
+     def run(self, args: argparse.Namespace) -> None:
+         bounce(args.server_handle)
+
+
+ class StopCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         CommonArguments.add_server_handle(subparser)
+
+     def run(self, args: argparse.Namespace) -> None:
+         stop(args.server_handle)
+
+
+ def get_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Monarch CLI")
+     subparser = parser.add_subparsers(title="COMMANDS")
+
+     for cmd_name, cmd in {
+         "create": CreateCmd(),
+         "info": InfoCmd(),
+         "kill": KillCmd(),
+         # --- placeholder subcommands (not yet implemented) ---
+         "bounce": BounceCmd(),
+         "stop": StopCmd(),
+     }.items():
+         cmd_parser = subparser.add_parser(cmd_name)
+         cmd.add_arguments(cmd_parser)
+         cmd_parser.set_defaults(func=cmd.run)
+     return parser
+
+
+ def main(argv: list[str] = sys.argv[1:]) -> None:
+     parser = get_parser()
+     args = parser.parse_args(argv)
+     args.func(args)
+
+
+ if __name__ == "__main__":
+     main()
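
For orientation, the sketch below drives this CLI through its own `main()` entry point rather than the installed console script (whose name is defined in entry_points.txt and not shown here); the `slurm:///123` handle is an illustrative placeholder:

    from monarch.tools.cli import main

    # Dry-run a submission to Slurm: parses flags via get_parser() and
    # dispatches to CreateCmd.run(), which prints the scheduler request.
    main(["create", "--scheduler", "slurm", "--dryrun"])

    # Describe an existing server; prints JSON to stdout, or an error to
    # stderr if the handle does not resolve ("slurm:///123" is illustrative).
    main(["info", "slurm:///123"])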
monarch/tools/commands.py ADDED
@@ -0,0 +1,251 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import argparse
+ import functools
+ import inspect
+ import logging
+ import os
+ import time
+ from datetime import timedelta
+ from typing import Any, Callable, Mapping, Optional, Union
+
+ from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
+     Config,
+     defaults,
+ )
+
+ from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
+ from torchx.runner import Runner
+ from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
+ from torchx.specs.builders import parse_args
+ from torchx.util.types import decode, decode_optional
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ def torchx_runner() -> Runner:
+     # namespace is currently unused so make it an empty str
+     # so that the server handle is short (e.g. slurm:///job-id)
+     _EMPTY_NS = ""
+     return Runner(_EMPTY_NS, defaults.scheduler_factories())
+
+
+ def component_args_from_cli(
+     component_fn: Callable[..., AppDef], component_args: list[str]
+ ) -> dict[str, Any]:
+     """Parses the component function's arguments from 'argname=argvalue' strings.
+
+     Returns: component arguments kwarg-ified.
+     """
+
+     cli_fied_component_args = []
+     for arg in component_args:
+         argname = arg.split("=")[0]
+         # torchx auto-generates an argparse parser for the component function based
+         # on type-hints and the docstring, as if the component were a CLI itself, so we
+         # CLI arg-ify the component arguments by adding a "-" for
+         # single-char argnames (short arg) and "--" for multi-char (long arg)
+         cli_fied_component_args.append(f"-{arg}" if len(argname) == 1 else f"--{arg}")
+
+     parsed_args: argparse.Namespace = parse_args(component_fn, cli_fied_component_args)
+
+     # TODO kiuk@ logic below needs to move into torchx.specs.builders.parse_args()
+     #  which is copied from torchx.specs.builders.materialize_appdef()
+     #  parse_args() returns all the component parameters parsed from cli inputs
+     #  as a string. Additional parameter type matching needs to be done (as below)
+     #  to turn the CLI inputs into component function arguments.
+     component_kwargs = {}
+
+     parameters = inspect.signature(component_fn).parameters
+     for param_name, parameter in parameters.items():
+         arg_value = getattr(parsed_args, param_name)
+         parameter_type = parameter.annotation
+         parameter_type = decode_optional(parameter_type)
+         arg_value = decode(arg_value, parameter_type)
+         if parameter.kind == inspect.Parameter.VAR_POSITIONAL:
+             raise TypeError(
+                 f"component fn param `{param_name}` is a '*arg' which is not supported; consider changing the type to a list"
+             )
+         elif parameter.kind == inspect.Parameter.VAR_KEYWORD:
+             raise TypeError(
+                 f"component fn param `{param_name}` is a '**kwargs' which is not supported; consider changing the type to a dict or explicitly declare the params"
+             )
+         else:
+             component_kwargs[param_name] = arg_value
+
+     return component_kwargs
+
+
+ def create(
+     config: Config,
+     component_fn: Optional[Callable[..., AppDef]] = None,
+ ) -> Callable[..., Union[str, AppDryRunInfo]]:
+     """Creates a monarch server by submitting it as a job to the target scheduler.
+
+     Note that this function returns a `Callable` that has to be called with the
+     same arguments one would pass to the `component_fn` to actually submit
+     the job that runs the monarch server.
+
+     Usage:
+
+     .. doc-test::
+
+         from monarch.tools.config import defaults
+
+         config = defaults.config(scheduler="slurm")
+         config.scheduler_args.update(
+             {
+                 "partition": "prod",
+                 "mail-user": "foo@bar.com",
+                 "mail-type": "FAIL",
+             }
+         )
+         config.dryrun = True
+
+         create(config)(host_type="gpu.medium", num_hosts=4)
+
+
+     Args:
+         config: the job config; `config.scheduler` is where the job that runs the
+             server is submitted and `config.scheduler_args` are the scheduler configs
+         component_fn: a function that returns the AppDef (job def).
+             If not provided, defaults to the configured default for the scheduler
+             (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)
+     """
+     scheduler: str = config.scheduler
+     cfg: Mapping[str, CfgVal] = config.scheduler_args
+     component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)
+
+     @functools.wraps(component)
+     def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
+         # for logging call-site context in application metadata
+         os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
+
+         appdef = component(*args, **kwargs)
+
+         with torchx_runner() as runner:
+             info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
+
+             info_json_fmt = AppDryRunInfo(
+                 info.request,
+                 fmt=defaults.dryrun_info_formatter(info),
+             )
+             info_json_fmt._app = info._app
+             info_json_fmt._cfg = info._cfg
+             info_json_fmt._scheduler = info._scheduler
+
+             if config.dryrun:
+                 return info_json_fmt
+             else:
+                 server_handle = runner.schedule(info)
+                 return server_handle
+
+     return _run
+
+
+ def info(server_handle: str) -> Optional[ServerSpec]:
+     """Calls the ``describe`` API on the scheduler hosting the server to get
+     information about it.
+
+     Returns ``None`` if the server's job is not found in the scheduler's
+     control-plane. This can happen if the job does not exist
+     (e.g. typo in the server_handle) or the job already exited a long time ago.
+
+     NOTE: This function can return non-empty info for jobs that have
+     exited recently.
+     """
+     with torchx_runner() as runner:
+         status = runner.status(server_handle)
+         if status is None:
+             return None
+
+         appdef = runner.describe(server_handle)
+         if appdef is None:
+             return None
+
+     # host status grouped by mesh (role) names
+     replica_status = {r.role: r.replicas for r in status.roles}
+
+     mesh_specs = []
+     for role in appdef.roles:
+         spec = mesh_spec_from_metadata(appdef, role.name)
+         assert spec is not None, "cannot be 'None' since we iterate over appdef's roles"
+
+         # null-guard since some schedulers do not fill replica_status
+         if host_status := replica_status.get(role.name):
+             spec.hostnames = [h.hostname for h in host_status]
+
+         mesh_specs.append(spec)
+
+     return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
+
+
+ _5_SECONDS = timedelta(seconds=5)
+
+
+ async def server_ready(
+     server_handle: str, check_interval: timedelta = _5_SECONDS
+ ) -> Optional[ServerSpec]:
+     """Waits until the server's job is in the RUNNING state, then returns the server spec.
+     Returns `None` if the server does not exist.
+
+     NOTE: Certain fields such as `hostnames` are only filled (and valid) when the server is RUNNING.
+
+     Usage:
+
+     .. code-block:: python
+
+         server_info = await server_ready("slurm:///123")
+         if not server_info:
+             print("Job does not exist")
+         else:
+             if server_info.is_running:
+                 for mesh in server_info.meshes:
+                     connect_to(mesh.hostnames)
+             else:
+                 print(f"Job in {server_info.state} state. Hostnames are not available")
+
+     """
+
+     while True:
+         server_spec = info(server_handle)
+
+         if not server_spec:  # server not found
+             return None
+
+         if server_spec.state <= AppState.PENDING:  # UNSUBMITTED or SUBMITTED or PENDING
+             # NOTE: TorchX currently does not have async APIs so we need to loop-on-interval
+             # TODO maybe inverse exponential backoff instead of constant interval?
+             check_interval_seconds = check_interval.total_seconds()
+             logger.info(
+                 "waiting for %s to be %s (current: %s), will check again in %g seconds...",
+                 server_handle,
+                 AppState.RUNNING,
+                 server_spec.state,
+                 check_interval_seconds,
+             )
+             time.sleep(check_interval_seconds)
+             continue
+         else:
+             return server_spec
+
+
+ def kill(server_handle: str) -> None:
+     with torchx_runner() as runner:
+         runner.cancel(server_handle)
+
+
+ def bounce(server_handle: str) -> None:
+     """(Re)starts the server's processes without tearing down the server's job."""
+     raise NotImplementedError("`bounce` is not yet implemented")
+
+
+ def stop(server_handle: str) -> None:
+     """Stops the server's unix processes without tearing down the server's job."""
+     raise NotImplementedError("`stop` is not yet implemented")
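
End to end, this module is meant to be used roughly as in the sketch below; the mesh spec and scheduler arg are illustrative values, and the handle follows the `slurm:///job-id` convention noted in `torchx_runner()`:

    import asyncio

    from monarch.tools.commands import create, kill, server_ready
    from monarch.tools.config import defaults

    config = defaults.config(scheduler="slurm")
    config.scheduler_args.update({"partition": "dev"})  # illustrative scheduler arg

    # create() wraps the component fn; calling the wrapper submits the job and
    # returns a server handle (or an AppDryRunInfo when config.dryrun is True).
    handle = create(config)(meshes=["mesh_0:2:gpu.small"])

    # Poll until the job leaves PENDING, then use the now-valid hostnames.
    server_spec = asyncio.run(server_ready(handle))
    if server_spec and server_spec.is_running:
        for mesh in server_spec.meshes:
            print(mesh.name, mesh.hostnames)

    kill(handle)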
monarch/tools/components/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
monarch/tools/components/hyperactor.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ import getpass
+ from typing import Optional
+
+ from monarch.tools import mesh_spec
+ from monarch.tools.mesh_spec import mesh_spec_from_str
+ from torchx import specs
+
+ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]
+
+ _USER: str = getpass.getuser()
+
+ __version__ = "latest"  # TODO get version from monarch.__version__
+
+
+ def proc_mesh(
+     name: str = f"monarch-{_USER}",
+     image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
+     meshes: list[str] = _DEFAULT_MESHES,
+     env: Optional[dict[str, str]] = None,
+     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
+     program: str = "monarch_bootstrap",  # installed with the monarch wheel (as a console script)
+ ) -> specs.AppDef:
+     """
+     Args:
+         name: the name of the monarch server job
+         image: docker image to run the job on; for slurm, image is the dir the job is run from
+         meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}"
+         env: environment variables to be passed to the main command (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
+         port: the port that the remote process allocator runs on (must be reachable from the client)
+         program: path to the binary that the remote process allocator spawns on an allocation request
+     """
+
+     appdef = specs.AppDef(name)
+
+     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
+         mesh_role = specs.Role(
+             name=mesh.name,
+             image=image,
+             entrypoint="process_allocator",  # run "cargo install monarch_hyperactor" to get this binary
+             args=[
+                 f"--port={port}",
+                 f"--program={program}",
+             ],
+             num_replicas=mesh.num_hosts,
+             resource=specs.resource(h=mesh.host_type),
+             env=env or {},
+             port_map={"mesh": port},
+         )
+         appdef.roles.append(mesh_role)
+
+     return appdef
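
Each mesh spec string packs `{name}:{num_hosts}:{host_type}` into one token, and each mesh becomes one TorchX role running `process_allocator`; a minimal sketch, where the mesh names and host types are illustrative values:

    from monarch.tools.components.hyperactor import proc_mesh

    # "trainers"/"evals" and the gpu.* host types are illustrative.
    appdef = proc_mesh(meshes=["trainers:4:gpu.medium", "evals:1:gpu.small"])

    for role in appdef.roles:
        # one role per mesh: name, host count, and allocator entrypoint
        print(role.name, role.num_replicas, role.entrypoint)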
monarch/tools/config/__init__.py ADDED
@@ -0,0 +1,20 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+
+ NOT_SET: str = "__NOT_SET__"
+
+
+ @dataclass
+ class Config:
+     scheduler: str = NOT_SET
+     scheduler_args: dict[str, Any] = field(default_factory=dict)
+     workspace: Optional[str] = None
+     dryrun: bool = False
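
`Config` is a plain dataclass, with `NOT_SET` serving as a sentinel for a scheduler that was never chosen; a small sketch (the scheduler arg and workspace path are illustrative):

    from monarch.tools.config import NOT_SET, Config

    cfg = Config(scheduler="slurm", workspace="/home/me/proj", dryrun=True)
    cfg.scheduler_args["partition"] = "dev"  # illustrative scheduler arg
    assert cfg.scheduler != NOT_SET  # NOT_SET would mean "no scheduler configured"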
monarch/tools/config/defaults.py ADDED
@@ -0,0 +1,54 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ """Defines defaults for ``monarch.tools``"""
+
+ from typing import Callable, Optional
+
+ from monarch.tools.components import hyperactor
+ from monarch.tools.config import Config
+
+ from torchx import specs
+ from torchx.schedulers import (
+     docker_scheduler,
+     kubernetes_scheduler,
+     local_scheduler,
+     SchedulerFactory,
+     slurm_scheduler,
+ )
+
+
+ def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
+     """The default TorchX component function for the scheduler"""
+     return hyperactor.proc_mesh
+
+
+ def scheduler_factories() -> dict[str, SchedulerFactory]:
+     """Supported schedulers (name -> scheduler static factory method)"""
+     return {  # pyre-ignore[7]
+         # --- local schedulers (no multi-host support) ---
+         "local_cwd": local_scheduler.create_scheduler,
+         "local_docker": docker_scheduler.create_scheduler,
+         # --- remote schedulers (multi-host support) ---
+         "slurm": slurm_scheduler.create_scheduler,
+         "k8s": kubernetes_scheduler.create_scheduler,
+     }
+
+
+ def config(scheduler: str, workspace: Optional[str] = None) -> Config:
+     """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``."""
+     return Config(scheduler=scheduler, workspace=workspace)
+
+
+ def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]:
+     """Used to attach a formatter to the dryrun info when running
+     :py:func:`~monarch.tools.commands.create` in ``dryrun`` mode so that
+     the returned ``AppDryRunInfo`` can be printed to the console.
+     """
+     # no-op, use the default formatter already attached to the dryrun info
+     return dryrun_info._fmt
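
These defaults compose as in the brief sketch below; the scheduler names come straight from `scheduler_factories()` above:

    from monarch.tools.config import defaults

    cfg = defaults.config(scheduler="slurm")
    component = defaults.component_fn(cfg.scheduler)  # currently hyperactor.proc_mesh for every scheduler
    print(sorted(defaults.scheduler_factories().keys()))
    # ['k8s', 'local_cwd', 'local_docker', 'slurm']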