torchmonarch-nightly 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
monarch/tools/cli.py ADDED
@@ -0,0 +1,167 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import argparse
9
+ import json
10
+ import sys
11
+
12
+ from monarch.tools.commands import (
13
+ bounce,
14
+ component_args_from_cli,
15
+ create,
16
+ info,
17
+ kill,
18
+ stop,
19
+ torchx_runner,
20
+ )
21
+ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
22
+ Config,
23
+ defaults,
24
+ )
25
+ from torchx.specs.finder import get_component
26
+
27
+
28
def config_from_cli_args(args: argparse.Namespace) -> Config:
    """Builds a ``Config`` from parsed CLI arguments.

    Starts from the scheduler's default config, then overlays any
    ``-cfg KEY=VALUE`` scheduler args and the ``--dryrun`` flag.
    """
    cfg = defaults.config(args.scheduler, args.workspace)

    if args.scheduler_args:
        with torchx_runner() as runner:
            run_opts = runner.scheduler_run_opts(cfg.scheduler)
            for raw_arg in args.scheduler_args:
                cfg.scheduler_args.update(run_opts.cfg_from_str(raw_arg))

    cfg.dryrun = args.dryrun
    return cfg
40
+
41
+
42
class CreateCmd:
    """`create` subcommand: submits a new monarch server as a job to a scheduler."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        """Registers `create`'s CLI flags on the given subcommand parser."""
        subparser.add_argument(
            "-s",
            "--scheduler",
            type=str,
            help="Scheduler to submit to",
        )
        subparser.add_argument(
            "-cfg",
            "--scheduler_args",
            default=[],
            action="append",
            help="Scheduler args (e.g. `-cfg cluster=foo -cfg user=bar`)",
        )
        subparser.add_argument(
            "--dryrun",
            action="store_true",
            default=False,
            help="Just prints the scheduler request",
        )
        subparser.add_argument(
            "--workspace",
            help="The local directory to build into the job's image and make available on the job."
            " Pass --workspace='' to disable any default workspaces configured for the scheduler",
        )
        subparser.add_argument(
            "--component",
            help="A custom TorchX component to use",
        )
        subparser.add_argument(
            "-arg",
            "--component_args",
            default=[],
            action="append",
            help="Arguments to the component fn (e.g. `-arg a=b -arg c=d` to pass as `component_fn(a=b, c=d)`)",
        )

    def run(self, args: argparse.Namespace) -> None:
        """Resolves the component, submits the job, and prints the server handle.

        When ``--dryrun`` is set, `create` returns (and this prints) the dryrun
        info instead of a handle.
        """
        config = config_from_cli_args(args)

        # Use the user-specified TorchX component if one was given;
        # otherwise fall back to the scheduler's default component.
        component_fn = (
            get_component(args.component).fn
            if args.component
            else defaults.component_fn(config.scheduler)
        )
        component_args = component_args_from_cli(component_fn, args.component_args)
        handle = create(config, component_fn)(**component_args)
        print(handle)
91
+
92
+
93
class CommonArguments:
    """Argument helpers shared by subcommands that act on an existing server."""

    @staticmethod
    def add_server_handle(subparser: argparse.ArgumentParser) -> None:
        """Registers the positional `server_handle` argument on *subparser*."""
        subparser.add_argument(
            "server_handle",
            type=str,
            help="monarch server handle (e.g. slurm:///job_id)",
        )
101
+
102
+
103
class InfoCmd:
    """`info` subcommand: prints the server spec of an existing server as JSON."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        handle = args.server_handle
        server_spec = info(handle)
        # Guard clause: a missing job is reported on stderr, not raised.
        if server_spec is None:
            print(f"Server: {handle} does not exist", file=sys.stderr)
            return
        json.dump(server_spec.to_json(), fp=sys.stdout)
116
+
117
+
118
class KillCmd:
    """`kill` subcommand: tears down the server's underlying job."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        kill(args.server_handle)
124
+
125
+
126
class BounceCmd:
    """`bounce` subcommand: restarts server processes (delegates to `bounce`)."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        bounce(args.server_handle)
132
+
133
+
134
class StopCmd:
    """`stop` subcommand: stops server processes (delegates to `stop`)."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        stop(args.server_handle)
140
+
141
+
142
def get_parser() -> argparse.ArgumentParser:
    """Builds the top-level `monarch` CLI argument parser with all subcommands."""
    parser = argparse.ArgumentParser(description="Monarch CLI")
    subparser = parser.add_subparsers(title="COMMANDS")

    commands = [
        ("create", CreateCmd()),
        ("info", InfoCmd()),
        ("kill", KillCmd()),
        # --- placeholder subcommands (not yet implemented) ---
        ("bounce", BounceCmd()),
        ("stop", StopCmd()),
    ]
    for cmd_name, cmd in commands:
        cmd_parser = subparser.add_parser(cmd_name)
        cmd.add_arguments(cmd_parser)
        # Each subcommand parser dispatches to its command object's run().
        cmd_parser.set_defaults(func=cmd.run)
    return parser
158
+
159
+
160
def main(argv: "list[str] | None" = None) -> None:
    """CLI entrypoint.

    Args:
        argv: argument vector to parse; defaults to ``sys.argv[1:]``.
    """
    # NOTE: default to None rather than `sys.argv[1:]` -- a `sys.argv[1:]`
    # default is evaluated once at import time, so later changes to sys.argv
    # (e.g. in tests) would be silently ignored.
    parser = get_parser()
    args = parser.parse_args(sys.argv[1:] if argv is None else argv)
    if not hasattr(args, "func"):
        # No subcommand given: the subparsers are not `required`, so argparse
        # yields a Namespace without `func`; print usage instead of crashing
        # with AttributeError.
        parser.print_usage(sys.stderr)
        parser.exit(2)
    args.func(args)


if __name__ == "__main__":
    main()
@@ -0,0 +1,189 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ import argparse
10
+ import functools
11
+ import inspect
12
+ import os
13
+ from typing import Any, Callable, Mapping, Optional, Union
14
+
15
+ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
16
+ Config,
17
+ defaults,
18
+ )
19
+
20
+ from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
21
+
22
+ from torchx.runner import Runner
23
+ from torchx.specs import AppDef, AppDryRunInfo, CfgVal
24
+ from torchx.specs.builders import parse_args
25
+ from torchx.util.types import decode, decode_optional
26
+
27
+
28
def torchx_runner() -> Runner:
    """Returns a TorchX ``Runner`` configured with monarch's supported schedulers."""
    # The namespace is currently unused, so pass an empty string to keep
    # server handles short (e.g. slurm:///job-id).
    return Runner("", defaults.scheduler_factories())
33
+
34
+
35
def component_args_from_cli(
    component_fn: Callable[..., AppDef], component_args: list[str]
) -> dict[str, Any]:
    """Parses component function's arguments from 'argname=argvalue' strings.

    Args:
        component_fn: the component function whose signature (type hints and
            docstring) drives the argument parsing.
        component_args: raw ``name=value`` strings, e.g. from `-arg` CLI flags.

    Returns: component arguments kwarg-ified (ready to splat into ``component_fn``).

    Raises:
        TypeError: if ``component_fn`` declares ``*args`` or ``**kwargs``
            parameters, which are not supported.
    """

    cli_fied_component_args = []
    for arg in component_args:
        argname = arg.split("=")[0]
        # torchx auto-generates an argparse parser for component function based
        # type-hints and docstring as if the component was a CLI itself so we have to
        # CLI arg-ify the component arguments by adding a "-" for
        # single-char argnames (short arg) and "--" for multi-char (long arg)
        cli_fied_component_args.append(f"-{arg}" if len(argname) == 1 else f"--{arg}")

    parsed_args: argparse.Namespace = parse_args(component_fn, cli_fied_component_args)

    # TODO kiuk@ logic below needs to move into torchx.specs.builders.parse_args()
    #  which is copied from torchx.specs.builders.materialize_appdef()
    #  parse_args() returns all the component parameters parsed from cli inputs
    #  as a string. Additional parameter type matching needs to be done (as below)
    #  to turn the CLI inputs to component function arguments.
    component_kwargs = {}

    parameters = inspect.signature(component_fn).parameters
    for param_name, parameter in parameters.items():
        # Every declared parameter is present on the namespace as a string;
        # decode it back to the annotated type (unwrapping Optional first).
        arg_value = getattr(parsed_args, param_name)
        parameter_type = parameter.annotation
        parameter_type = decode_optional(parameter_type)
        arg_value = decode(arg_value, parameter_type)
        if parameter.kind == inspect.Parameter.VAR_POSITIONAL:
            raise TypeError(
                f"component fn param `{param_name}` is a '*arg' which is not supported; consider changing the type to a list"
            )
        elif parameter.kind == inspect.Parameter.VAR_KEYWORD:
            raise TypeError(
                f"component fn param `{param_name}` is a '**kwargs' which is not supported; consider changing the type to a dict or explicitly declare the params"
            )
        else:
            component_kwargs[param_name] = arg_value

    return component_kwargs
79
+
80
+
81
def create(
    config: Config,
    component_fn: Optional[Callable[..., AppDef]] = None,
) -> Callable[..., Union[str, AppDryRunInfo]]:
    """Creates a monarch server by submitting it as a job to the target scheduler.

    Note that this function returns a `Callable` that has to be called with the
    same arguments that one would call the `component_fn` to actually submit
    the job that runs the monarch server.

    Usage:

    .. doc-test::

        from monarch.tools.config import defaults

        config = defaults.config(scheduler="slurm")
        config.scheduler_args.update(
            {
                "partition": "prod",
                "mail-user": "foo@bar.com",
                "mail-type": "FAIL",
            }
        )
        config.dryrun = True

        create(config)(host_type="gpu.medium", num_hosts=4)


    Args:
        config: scheduler, scheduler args, workspace, and dryrun settings
            for the job submission.
        component_fn: a function that returns the AppDef (job def).
            If not provided, defaults to the configured default for the scheduler
            (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)

    Returns:
        A callable that, when invoked with the component's arguments, returns
        the server handle (str), or the ``AppDryRunInfo`` if ``config.dryrun``.
    """
    scheduler: str = config.scheduler
    cfg: Mapping[str, CfgVal] = config.scheduler_args
    component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)

    @functools.wraps(component)
    def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
        # for logging call-site context in application metadata
        os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")

        appdef = component(*args, **kwargs)

        with torchx_runner() as runner:
            info = runner.dryrun(appdef, scheduler, cfg, config.workspace)

            # Re-wrap the dryrun info so it renders with monarch's
            # (JSON-oriented) formatter instead of the scheduler default.
            info_json_fmt = AppDryRunInfo(
                info.request,
                fmt=defaults.dryrun_info_formatter(info),
            )
            # NOTE(review): copying these private fields mirrors what
            # torchx does internally when constructing a dryrun info.
            info_json_fmt._app = info._app
            info_json_fmt._cfg = info._cfg
            info_json_fmt._scheduler = info._scheduler

            if config.dryrun:
                return info_json_fmt
            else:
                server_handle = runner.schedule(info)
                return server_handle

    return _run
146
+
147
+
148
def info(server_handle: str) -> Optional[ServerSpec]:
    """Calls the ``describe`` API on the scheduler hosting the server to get
    information about it.

    Returns ``None`` if the server's job is not found in the scheduler's
    control-plane. This can happen if the job does not exist
    (e.g. typo in the server_handle) or the job already exited a long time ago.

    NOTE: This function can return non-empty info for jobs that have
    exited recently.
    """
    with torchx_runner() as runner:
        status = runner.status(server_handle)
        if status is None:
            return None

        appdef = runner.describe(server_handle)
        if appdef is None:
            return None

    # Every role in the appdef corresponds to exactly one mesh.
    mesh_specs = []
    for role in appdef.roles:
        spec = mesh_spec_from_metadata(appdef, role.name)
        assert spec is not None, "cannot be 'None' since we iterate over appdef's roles"
        mesh_specs.append(spec)

    return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
175
+
176
+
177
def kill(server_handle: str) -> None:
    """Cancels (tears down) the job backing the given server handle."""
    with torchx_runner() as runner:
        runner.cancel(server_handle)
180
+
181
+
182
def bounce(server_handle: str) -> None:
    """(re)starts the server's processes without tearing down the server's job.

    Placeholder: process-level restart is not wired up yet.
    """
    raise NotImplementedError("`bounce` is not yet implemented")
185
+
186
+
187
def stop(server_handle: str) -> None:
    """Stops the server's unix processes without tearing down the server's job.

    Placeholder: process-level stop is not wired up yet.
    """
    raise NotImplementedError("`stop` is not yet implemented")
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
@@ -0,0 +1,57 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import getpass
9
+ from typing import Optional
10
+
11
+ from monarch.tools import mesh_spec
12
+ from monarch.tools.mesh_spec import mesh_spec_from_str
13
+ from torchx import specs
14
+
15
+ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]
16
+
17
+ _USER: str = getpass.getuser()
18
+
19
+ __version__ = "latest" # TODO get version from monarch.__version_
20
+
21
+
22
def proc_mesh(
    name: str = f"monarch-{_USER}",
    image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
    meshes: Optional[list[str]] = None,
    env: Optional[dict[str, str]] = None,
    port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
) -> specs.AppDef:
    """
    Args:
        name: the name of the monarch server job
        image: docker image to run the job on, for slurm, image is the dir the job is run from
        meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}";
            defaults to a single small mesh (``mesh_0:1:gpu.small``)
        env: environment variables to be passed to the main command (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
        port: the port that the remote process allocator runs on (must be reachable from the client)

    Returns:
        A TorchX ``AppDef`` with one role per mesh.
    """
    # Avoid a mutable default argument: a `meshes=_DEFAULT_MESHES` default
    # aliases the module-level list, so an in-place mutation by any caller
    # would leak into every subsequent call. Use a None sentinel instead.
    if meshes is None:
        meshes = _DEFAULT_MESHES

    appdef = specs.AppDef(name)

    for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
        mesh_role = specs.Role(
            name=mesh.name,
            image=image,
            entrypoint="process_allocator",  # 'cargo install monarch_hyperactor' to get this binary
            args=[
                "mesh-worker",
                f"--port={port}",
                "--program=monarch_bootstrap",  # installed with monarch wheel (as console script)
            ],
            num_replicas=mesh.num_hosts,
            resource=specs.resource(h=mesh.host_type),
            env=env or {},
            port_map={"mesh": port},
        )
        appdef.roles.append(mesh_role)

    return appdef
@@ -0,0 +1,20 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ from dataclasses import dataclass, field
9
+ from typing import Any, Optional
10
+
11
+
12
# Sentinel meaning "scheduler was not provided"; kept as a str so the
# `scheduler` field's type stays plain `str` rather than Optional[str].
NOT_SET: str = "__NOT_SET__"


@dataclass
class Config:
    """Configuration for submitting a monarch server job.

    Built by ``monarch.tools.config.defaults.config`` and consumed by
    ``monarch.tools.commands.create``.
    """

    # target scheduler name (e.g. "slurm", "k8s"); NOT_SET until chosen
    scheduler: str = NOT_SET
    # scheduler-specific run options (e.g. {"partition": "prod"})
    scheduler_args: dict[str, Any] = field(default_factory=dict)
    # local directory to build into the job's image; None means no workspace
    workspace: Optional[str] = None
    # when True, `create` returns the dryrun info instead of submitting
    dryrun: bool = False
@@ -0,0 +1,54 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ """Defines defaults for ``monarch.tools``"""
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from monarch.tools.components import hyperactor
14
+ from monarch.tools.config import Config
15
+
16
+ from torchx import specs
17
+ from torchx.schedulers import (
18
+ docker_scheduler,
19
+ kubernetes_scheduler,
20
+ local_scheduler,
21
+ SchedulerFactory,
22
+ slurm_scheduler,
23
+ )
24
+
25
+
26
def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
    """The default TorchX component function for the scheduler.

    Currently the same component (``hyperactor.proc_mesh``) is used
    regardless of *scheduler*.
    """
    return hyperactor.proc_mesh
29
+
30
+
31
def scheduler_factories() -> dict[str, SchedulerFactory]:
    """Supported schedulers (name -> scheduler static factory method).

    The keys are the scheduler names accepted by the CLI's `--scheduler`
    flag and used as the scheme of server handles (e.g. ``slurm:///job-id``).
    """
    return {  # pyre-ignore[7]
        # --- local schedulers (no multi-host support) ---
        "local_cwd": local_scheduler.create_scheduler,
        "local_docker": docker_scheduler.create_scheduler,
        # --- remote schedulers (yes multi-host support) ---
        "slurm": slurm_scheduler.create_scheduler,
        "k8s": kubernetes_scheduler.create_scheduler,
    }
41
+
42
+
43
def config(scheduler: str, workspace: Optional[str] = None) -> Config:
    """The default :py:class:`~monarch.tools.config.Config` to use when
    submitting to the provided ``scheduler``."""
    return Config(
        scheduler=scheduler,
        workspace=workspace,
    )
46
+
47
+
48
def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]:
    """Used to attach a formatter to the dryrun info when running
    :py:function:`~monarch.tools.commands.create` in ``dryrun`` mode so that
    the returned ``AppDryrunInfo`` can be printed to console.
    """
    # no-op, use the default formatter already attached to the dryrun info
    # (reads torchx's private `_fmt` attribute -- NOTE(review): relies on
    # AppDryRunInfo internals; confirm against the pinned torchx version)
    return dryrun_info._fmt
@@ -0,0 +1,121 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import string
9
+ from dataclasses import dataclass
10
+ from typing import Any, Optional
11
+
12
+ from torchx import specs
13
+
14
# Default port the remote process allocator listens on.
DEFAULT_REMOTE_ALLOCATOR_PORT = 26600

# AppDef.metadata tag templates; `${mesh_name}` is substituted by `_tag()`.
_TAG_MESHES_PREFIX = "monarch/meshes/${mesh_name}/"
_TAG_HOST_TYPE: str = _TAG_MESHES_PREFIX + "host_type"
_TAG_GPUS: str = _TAG_MESHES_PREFIX + "gpus"
19
+
20
+
21
@dataclass
class MeshSpec:
    """Doubles as the 'input' specifications of how to setup the mesh role
    when submitting the job and as the 'info' (describe) API's return value.
    """

    # mesh (and TorchX role) name
    name: str
    # number of hosts (role replicas) in the mesh
    num_hosts: int
    # host type string resolvable via `torchx.specs.resource(h=...)`
    host_type: str
    # GPUs per host (-1 when unknown, see mesh_spec_from_metadata)
    gpus: int
    # port the remote process allocator listens on
    port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
32
+
33
+
34
+ def _tag(mesh_name: str, tag_template: str) -> str:
35
+ return string.Template(tag_template).substitute(mesh_name=mesh_name)
36
+
37
+
38
def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
    """Records the mesh's host_type and gpus into ``appdef.metadata``.

    These tags allow `mesh_spec_from_metadata` to reconstruct a MeshSpec
    from a scheduler 'describe' response.
    """
    metadata = appdef.metadata
    metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
    metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
41
+
42
+
43
def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
    """Reconstructs the MeshSpec for *mesh_name* from the appdef's roles and
    metadata tags; returns ``None`` when no role with that name exists."""
    role = next((r for r in appdef.roles if r.name == mesh_name), None)
    if role is None:
        return None

    # host_type/gpus come from metadata tags (see `tag_as_metadata`);
    # fall back to "" / -1 when the job was not tagged.
    return MeshSpec(
        name=mesh_name,
        num_hosts=role.num_replicas,
        host_type=appdef.metadata.get(_tag(mesh_name, _TAG_HOST_TYPE), ""),
        gpus=int(appdef.metadata.get(_tag(mesh_name, _TAG_GPUS), "-1")),
        port=role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT),
    )
55
+
56
+
57
def mesh_spec_from_str(mesh_spec_str: str) -> MeshSpec:
    """Parses the given string into a MeshSpec.

    Args:
        mesh_spec_str: A string representation of the mesh specification
            in the format 'NAME:NUM_HOSTS:HOST_TYPE' (e.g. 'trainer:8:gpu.medium').

    Raises:
        ValueError: if the string is malformed or NUM_HOSTS is not a number.
    """
    # Validate with ValueError, not `assert`: this parses user-provided CLI
    # input and asserts are stripped when running under `python -O`.
    parts = mesh_spec_str.split(":")
    if len(parts) != 3:
        raise ValueError(
            f"`{mesh_spec_str}` is not of the form 'NAME:NUM_HOSTS:HOST_TYPE'"
        )

    name, num_hosts, host_type = parts
    if not num_hosts.isdigit():
        raise ValueError(f"`{num_hosts}` is not a number in: {mesh_spec_str}")

    # Resolve GPU count from the host type after the cheap validations pass.
    gpus = specs.resource(h=host_type).gpu

    return MeshSpec(name, int(num_hosts), host_type, gpus)
75
+
76
+
77
@dataclass
class ServerSpec:
    """Holds information (as returned by the 'describe' API of the scheduler)
    about the monarch server. This is the return value of the
    ``monarch.tools.commands.info`` API.
    """

    name: str
    state: specs.AppState
    meshes: list[MeshSpec]

    def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
        """Returns the mesh named *mesh_name*; raises ValueError if absent."""
        for spec in self.meshes:
            if spec.name == mesh_name:
                return spec

        raise ValueError(
            f"Mesh: '{mesh_name}' not found in job: {self.name}. Try one of: {self.get_mesh_names()}"
        )

    def get_mesh_names(self) -> list[str]:
        """Names of all meshes in this server, in declaration order."""
        return [mesh.name for mesh in self.meshes]

    def to_json(self) -> dict[str, Any]:
        """Returns the JSON form of this struct that can be printed to console by:

        .. code-block:: python

            import json

            server_spec = ServerSpec(...)
            print(json.dumps(server_spec.to_json(), indent=2))
        """
        meshes_json = {
            mesh.name: {
                "host_type": mesh.host_type,
                "hosts": mesh.num_hosts,
                "gpus": mesh.gpus,
            }
            for mesh in self.meshes
        }
        return {
            "name": self.name,
            "state": self.state.name,
            "meshes": meshes_json,
        }
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict