torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/tools/cli.py
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import argparse
|
9
|
+
import json
|
10
|
+
import sys
|
11
|
+
|
12
|
+
from monarch.tools.commands import (
|
13
|
+
bounce,
|
14
|
+
component_args_from_cli,
|
15
|
+
create,
|
16
|
+
info,
|
17
|
+
kill,
|
18
|
+
stop,
|
19
|
+
torchx_runner,
|
20
|
+
)
|
21
|
+
from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
|
22
|
+
Config,
|
23
|
+
defaults,
|
24
|
+
)
|
25
|
+
from torchx.specs.finder import get_component
|
26
|
+
|
27
|
+
|
28
|
+
def config_from_cli_args(args: argparse.Namespace) -> Config:
    """Builds a :py:class:`Config` from the parsed CLI arguments.

    Starts from the scheduler's default config, then folds in any
    ``-cfg key=value`` scheduler args and the ``--dryrun`` flag.
    """
    cfg = defaults.config(args.scheduler, args.workspace)

    if args.scheduler_args:
        # let the scheduler's own run-opts parser interpret each "key=value" string
        with torchx_runner() as runner:
            run_opts = runner.scheduler_run_opts(cfg.scheduler)
            for raw in args.scheduler_args:
                cfg.scheduler_args.update(run_opts.cfg_from_str(raw))

    cfg.dryrun = args.dryrun
    return cfg
|
40
|
+
|
41
|
+
|
42
|
+
class CreateCmd:
    """``monarch create`` subcommand: submits a job that runs a monarch server."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        """Registers the create-specific CLI flags on ``subparser``."""
        subparser.add_argument(
            "-s",
            "--scheduler",
            type=str,
            help="Scheduler to submit to",
        )
        subparser.add_argument(
            "-cfg",
            "--scheduler_args",
            default=[],
            action="append",
            help="Scheduler args (e.g. `-cfg cluster=foo -cfg user=bar`)",
        )
        subparser.add_argument(
            "--dryrun",
            action="store_true",
            default=False,
            help="Just prints the scheduler request",
        )
        subparser.add_argument(
            "--workspace",
            help="The local directory to build into the job's image and make available on the job."
            " Pass --workspace='' to disable any default workspaces configured for the scheduler",
        )
        subparser.add_argument(
            "--component",
            help="A custom TorchX component to use",
        )
        subparser.add_argument(
            "-arg",
            "--component_args",
            default=[],
            action="append",
            help="Arguments to the component fn (e.g. `-arg a=b -arg c=d` to pass as `component_fn(a=b, c=d)`)",
        )

    def run(self, args: argparse.Namespace) -> None:
        """Resolves the component, kwarg-ifies its args, and submits the job."""
        config = config_from_cli_args(args)

        # an explicitly-named TorchX component wins over the scheduler's default
        if args.component:
            component_fn = get_component(args.component).fn
        else:
            component_fn = defaults.component_fn(config.scheduler)

        component_args = component_args_from_cli(component_fn, args.component_args)
        handle = create(config, component_fn)(**component_args)
        print(handle)
|
91
|
+
|
92
|
+
|
93
|
+
class CommonArguments:
    """Argparse arguments shared by subcommands that act on an existing server."""

    @staticmethod
    def add_server_handle(subparser: argparse.ArgumentParser) -> None:
        """Adds the positional server-handle argument (e.g. ``slurm:///job_id``)."""
        subparser.add_argument(
            "server_handle",
            type=str,
            help="monarch server handle (e.g. slurm:///job_id)",
        )
|
101
|
+
|
102
|
+
|
103
|
+
class InfoCmd:
    """``monarch info`` subcommand: prints the server spec as JSON to stdout."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        server_spec = info(args.server_handle)
        # not-found goes to stderr so stdout stays valid JSON (or empty)
        if server_spec is None:
            print(
                f"Server: {args.server_handle} does not exist",
                file=sys.stderr,
            )
            return
        json.dump(server_spec.to_json(), indent=2, fp=sys.stdout)
|
116
|
+
|
117
|
+
|
118
|
+
class KillCmd:
    """``monarch kill`` subcommand: cancels the server's job on its scheduler."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        kill(args.server_handle)
|
124
|
+
|
125
|
+
|
126
|
+
class BounceCmd:
    """``monarch bounce`` subcommand (placeholder; backing command not implemented yet)."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        bounce(args.server_handle)
|
132
|
+
|
133
|
+
|
134
|
+
class StopCmd:
    """``monarch stop`` subcommand (placeholder; backing command not implemented yet)."""

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        CommonArguments.add_server_handle(subparser)

    def run(self, args: argparse.Namespace) -> None:
        stop(args.server_handle)
|
140
|
+
|
141
|
+
|
142
|
+
def get_parser() -> argparse.ArgumentParser:
    """Builds the top-level ``monarch`` CLI parser, one subparser per command."""
    parser = argparse.ArgumentParser(description="Monarch CLI")
    subparser = parser.add_subparsers(title="COMMANDS")

    commands = {
        "create": CreateCmd(),
        "info": InfoCmd(),
        "kill": KillCmd(),
        # --- placeholder subcommands (not yet implemented) ---
        "bounce": BounceCmd(),
        "stop": StopCmd(),
    }
    for cmd_name, cmd in commands.items():
        cmd_parser = subparser.add_parser(cmd_name)
        cmd.add_arguments(cmd_parser)
        # dispatch: main() calls args.func(args) for whichever subcommand parsed
        cmd_parser.set_defaults(func=cmd.run)
    return parser
|
158
|
+
|
159
|
+
|
160
|
+
def main(argv: "list[str] | None" = None) -> None:
    """CLI entry point.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
            (Defaulting to ``None`` instead of ``sys.argv[1:]`` avoids
            capturing ``sys.argv`` at import time; ``parse_args(None)``
            reads the live ``sys.argv[1:]`` itself.)
    """
    parser = get_parser()
    args = parser.parse_args(argv)
    # subparsers are optional by default; without this guard, running the CLI
    # with no subcommand crashed with AttributeError on `args.func`
    if not hasattr(args, "func"):
        parser.print_help(sys.stderr)
        sys.exit(1)
    args.func(args)


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,251 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
import argparse
import asyncio
import functools
import inspect
import logging
import os
import time
from datetime import timedelta
from typing import Any, Callable, Mapping, Optional, Union

from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
    Config,
    defaults,
)
from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
from torchx.runner import Runner
from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
from torchx.specs.builders import parse_args
from torchx.util.types import decode, decode_optional
|
28
|
+
|
29
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
def torchx_runner() -> Runner:
    """Returns a TorchX :py:class:`Runner` wired with monarch's supported schedulers.

    The runner session namespace is deliberately the empty string so that
    server handles stay short (e.g. ``slurm:///job-id``).
    """
    empty_namespace = ""
    return Runner(empty_namespace, defaults.scheduler_factories())
|
37
|
+
|
38
|
+
|
39
|
+
def component_args_from_cli(
    component_fn: Callable[..., AppDef], component_args: list[str]
) -> dict[str, Any]:
    """Parses component function's arguments from 'argname=argvalue' strings.

    Returns: component arguments kwarg-ified.
    """
    # torchx auto-generates an argparse parser for the component function based
    # on its type-hints and docstring, as if the component were a CLI itself,
    # so each "name=value" pair is CLI arg-ified: "-" prefix for single-char
    # argnames (short arg) and "--" for multi-char (long arg)
    cli_fied = []
    for pair in component_args:
        argname = pair.split("=")[0]
        prefix = "-" if len(argname) == 1 else "--"
        cli_fied.append(prefix + pair)

    parsed_args: argparse.Namespace = parse_args(component_fn, cli_fied)

    # TODO kiuk@ logic below needs to move into torchx.specs.builders.parse_args()
    # which is copied from torchx.specs.builders.materialize_appdef()
    # parse_args() returns all the component parameters parsed from cli inputs
    # as a string. Additional parameter type matching needs to be done (as below)
    # to turn the CLI inputs to component function arguments.
    component_kwargs: dict[str, Any] = {}

    for param_name, parameter in inspect.signature(component_fn).parameters.items():
        # decode first (matching the original order), then reject variadics
        value = decode(
            getattr(parsed_args, param_name),
            decode_optional(parameter.annotation),
        )
        if parameter.kind == inspect.Parameter.VAR_POSITIONAL:
            raise TypeError(
                f"component fn param `{param_name}` is a '*arg' which is not supported; consider changing the type to a list"
            )
        elif parameter.kind == inspect.Parameter.VAR_KEYWORD:
            raise TypeError(
                f"component fn param `{param_name}` is a '**kwargs' which is not supported; consider changing the type to a dict or explicitly declare the params"
            )
        else:
            component_kwargs[param_name] = value

    return component_kwargs
|
83
|
+
|
84
|
+
|
85
|
+
def create(
    config: Config,
    component_fn: Optional[Callable[..., AppDef]] = None,
) -> Callable[..., Union[str, AppDryRunInfo]]:
    """Creates a monarch server by submitting it as a job to the target scheduler.

    Note that this function returns a `Callable` that has to be called with the
    same arguments that one would call the `component_fn` to actually submit
    the job that runs the monarch server.

    Usage:

    .. doc-test::

        from monarch.tools.config import defaults

        config = defaults.config(scheduler="slurm")
        config.scheduler_args.update(
            {
                "partition": "prod",
                "mail-user": "foo@bar.com",
                "mail-type": "FAIL",
            }
        )
        config.dryrun = True

        create(config)(host_type="gpu.medium", num_hosts=4)


    Args:
        config: scheduler, scheduler args, workspace, and dryrun settings
            for the submission (see :py:class:`~monarch.tools.config.Config`)
        component_fn: a function that returns the AppDef (job def).
            If not provided, defaults to the configured default for the scheduler
            (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)

    Returns:
        A callable with ``component_fn``'s signature that, when invoked, returns
        the scheduled job's server handle (str) — or the ``AppDryRunInfo`` when
        ``config.dryrun`` is set.
    """
    scheduler: str = config.scheduler
    cfg: Mapping[str, CfgVal] = config.scheduler_args
    component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)

    @functools.wraps(component)
    def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
        # for logging call-site context in application metadata
        os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")

        appdef = component(*args, **kwargs)

        with torchx_runner() as runner:
            info = runner.dryrun(appdef, scheduler, cfg, config.workspace)

            # re-wrap the request with monarch's own formatter so that, in
            # dryrun mode, printing the returned object produces readable output
            info_json_fmt = AppDryRunInfo(
                info.request,
                fmt=defaults.dryrun_info_formatter(info),
            )
            # AppDryRunInfo's constructor does not carry these over; copy the
            # private fields so the re-wrapped info is schedulable/equivalent
            info_json_fmt._app = info._app
            info_json_fmt._cfg = info._cfg
            info_json_fmt._scheduler = info._scheduler

            if config.dryrun:
                return info_json_fmt
            else:
                server_handle = runner.schedule(info)
                return server_handle

    return _run
|
150
|
+
|
151
|
+
|
152
|
+
def info(server_handle: str) -> Optional[ServerSpec]:
    """Calls the ``describe`` API on the scheduler hosting the server to get
    information about it.

    Returns ``None`` if the server's job is not found in the scheduler's
    control-plane. This can happen if the job does not exist
    (e.g. typo in the server_handle) or the job already exited a long time ago.

    NOTE: This function can return non-empty info for jobs that have
    exited recently.
    """
    with torchx_runner() as runner:
        status = runner.status(server_handle)
        if status is None:
            return None

        appdef = runner.describe(server_handle)
        if appdef is None:
            return None

        # host status grouped by mesh (role) names
        hosts_by_mesh = {r.role: r.replicas for r in status.roles}

        meshes = []
        for role in appdef.roles:
            spec = mesh_spec_from_metadata(appdef, role.name)
            assert spec is not None, "cannot be 'None' since we iterate over appdef's roles"

            # null-guard since some schedulers do not fill replica_status
            replicas = hosts_by_mesh.get(role.name)
            if replicas:
                spec.hostnames = [replica.hostname for replica in replicas]

            meshes.append(spec)

        return ServerSpec(name=appdef.name, state=status.state, meshes=meshes)
|
187
|
+
|
188
|
+
|
189
|
+
_5_SECONDS = timedelta(seconds=5)


async def server_ready(
    server_handle: str, check_interval: timedelta = _5_SECONDS
) -> Optional[ServerSpec]:
    """Waits until the server's job is in RUNNING state and returns the server spec.
    Returns `None` if the server does not exist.

    NOTE: Certain fields such as `hostnames` is only filled (and valid) when the server is RUNNING.

    Usage:

    .. code-block:: python

        server_info = await server_ready("slurm:///123")
        if not server_info:
            print(f"Job does not exist")
        else:
            if server_info.is_running:
                for mesh in server_info.meshes:
                    connect_to(mesh.hostnames)
            else:
                print(f"Job in {server_info.state} state. Hostnames are not available")

    """

    while True:
        server_spec = info(server_handle)

        if not server_spec:  # server not found
            return None

        if server_spec.state <= AppState.PENDING:  # UNSUBMITTED or SUBMITTED or PENDING
            # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
            # TODO maybe inverse exponential backoff instead of constant interval?
            check_interval_seconds = check_interval.total_seconds()
            logger.info(
                "waiting for %s to be %s (current: %s), will check again in %g seconds...",
                server_handle,
                AppState.RUNNING,
                server_spec.state,
                check_interval_seconds,
            )
            # BUGFIX: time.sleep() here would block the whole event loop for the
            # entire polling interval; asyncio.sleep() yields control instead
            await asyncio.sleep(check_interval_seconds)
            continue
        else:
            return server_spec
|
237
|
+
|
238
|
+
|
239
|
+
def kill(server_handle: str) -> None:
    """Cancels (tears down) the server's job on its scheduler."""
    with torchx_runner() as torchx:
        torchx.cancel(server_handle)
|
242
|
+
|
243
|
+
|
244
|
+
def bounce(server_handle: str) -> None:
    """(re)starts the server's processes without tearing down the server's job."""
    # placeholder — wired into the CLI but intentionally unimplemented for now
    raise NotImplementedError("`bounce` is not yet implemented")
|
247
|
+
|
248
|
+
|
249
|
+
def stop(server_handle: str) -> None:
    """Stops the server's unix processes without tearing down the server's job."""
    # placeholder — wired into the CLI but intentionally unimplemented for now
    raise NotImplementedError("`stop` is not yet implemented")
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import getpass
|
9
|
+
from typing import Optional
|
10
|
+
|
11
|
+
from monarch.tools import mesh_spec
|
12
|
+
from monarch.tools.mesh_spec import mesh_spec_from_str
|
13
|
+
from torchx import specs
|
14
|
+
|
15
|
+
_DEFAULT_MESHES = ["mesh_0:1:gpu.small"]

_USER: str = getpass.getuser()

__version__ = "latest"  # TODO get version from monarch.__version_


def proc_mesh(
    name: str = f"monarch-{_USER}",
    image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
    meshes: list[str] = _DEFAULT_MESHES,
    env: Optional[dict[str, str]] = None,
    port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
    program: str = "monarch_bootstrap",  # installed with monarch wheel (as console script)
) -> specs.AppDef:
    """
    Args:
        name: the name of the monarch server job
        image: docker image to run the job on, for slurm, image is the dir the job is run from
        meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}"
        env: environment variables to be passed to the main command (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
        port: the port that the remote process allocator runs on (must be reachable from the client)
        program: path to the binary that the remote process allocator spawns on an allocation request
    """
    # NOTE: the mutable default on `meshes` is never mutated here, and the
    # signature's defaults are introspected by torchx's component machinery,
    # so it must stay as-is.
    appdef = specs.AppDef(name)

    # one TorchX role (homogeneous host group) per requested mesh spec
    for mesh_str in meshes:
        mesh = mesh_spec_from_str(mesh_str)
        appdef.roles.append(
            specs.Role(
                name=mesh.name,
                image=image,
                entrypoint="process_allocator",  # run "cargo install monarch_hyperactor" to get this binary
                args=[
                    f"--port={port}",
                    f"--program={program}",
                ],
                num_replicas=mesh.num_hosts,
                resource=specs.resource(h=mesh.host_type),
                env=env or {},
                port_map={"mesh": port},
            )
        )

    return appdef
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
from dataclasses import dataclass, field
|
9
|
+
from typing import Any, Optional
|
10
|
+
|
11
|
+
|
12
|
+
# Sentinel default for required-but-not-yet-chosen string fields.
NOT_SET: str = "__NOT_SET__"


@dataclass
class Config:
    """Submission configuration for ``monarch.tools`` commands.

    Carries the target scheduler, its run options, the workspace to package,
    and whether to dry-run instead of actually submitting.
    """

    # TorchX scheduler name (e.g. "slurm", "k8s", "local_cwd"); NOT_SET until chosen
    scheduler: str = NOT_SET
    # scheduler-specific run options, passed through to the TorchX runner
    scheduler_args: dict[str, Any] = field(default_factory=dict)
    # local directory to build into the job's image and make available on the job
    # (None presumably falls back to the scheduler's default workspace — see CLI help)
    workspace: Optional[str] = None
    # when True, commands.create() returns the dryrun info instead of submitting
    dryrun: bool = False
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
"""Defines defaults for ``monarch.tools``"""
|
10
|
+
|
11
|
+
from typing import Callable, Optional
|
12
|
+
|
13
|
+
from monarch.tools.components import hyperactor
|
14
|
+
from monarch.tools.config import Config
|
15
|
+
|
16
|
+
from torchx import specs
|
17
|
+
from torchx.schedulers import (
|
18
|
+
docker_scheduler,
|
19
|
+
kubernetes_scheduler,
|
20
|
+
local_scheduler,
|
21
|
+
SchedulerFactory,
|
22
|
+
slurm_scheduler,
|
23
|
+
)
|
24
|
+
|
25
|
+
|
26
|
+
def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
    """The default TorchX component function for the scheduler.

    NOTE: currently returns ``hyperactor.proc_mesh`` for every scheduler;
    the ``scheduler`` argument is unused but kept for future
    per-scheduler defaults.
    """
    return hyperactor.proc_mesh
|
29
|
+
|
30
|
+
|
31
|
+
def scheduler_factories() -> dict[str, SchedulerFactory]:
    """Supported schedulers (name -> scheduler static factory method)"""
    # --- local schedulers (no multi-host support) ---
    local = {
        "local_cwd": local_scheduler.create_scheduler,
        "local_docker": docker_scheduler.create_scheduler,
    }
    # --- remote schedulers (yes multi-host support) ---
    remote = {
        "slurm": slurm_scheduler.create_scheduler,
        "k8s": kubernetes_scheduler.create_scheduler,
    }
    # pyre-ignore[7]
    return {**local, **remote}
|
41
|
+
|
42
|
+
|
43
|
+
def config(scheduler: str, workspace: Optional[str] = None) -> Config:
    """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``."""
    cfg = Config(scheduler=scheduler, workspace=workspace)
    return cfg
|
46
|
+
|
47
|
+
|
48
|
+
def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]:
    """Used to attach a formatter to the dryrun info when running
    :py:function:`~monarch.tools.commands.create` in ``dryrun`` mode so that
    the returned ``AppDryrunInfo`` can be printed to console.
    """
    # no-op: hand back the default formatter already attached to the dryrun info
    existing_fmt = dryrun_info._fmt
    return existing_fmt
|