torchx-nightly 2025.9.17__py3-none-any.whl → 2025.9.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchx-nightly might be problematic.
- torchx/components/__init__.py +0 -7
- torchx/schedulers/__init__.py +0 -1
- torchx/specs/__init__.py +10 -5
- torchx/specs/api.py +4 -1
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/METADATA +1 -15
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/RECORD +10 -22
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/ray/__init__.py +0 -6
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.9.17.dist-info → torchx_nightly-2025.9.19.dist-info}/top_level.txt +0 -0
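Taken together, the list above removes the KubeFlow Pipelines adapter (torchx/pipelines/kfp/* and its examples) and the Ray scheduler (torchx/schedulers/ray_scheduler.py and torchx/schedulers/ray/*) from the nightly wheel. A minimal defensive import for downstream code that still uses these modules might look like the sketch below; the module path comes from this diff, while the fallback message and the idea of pinning 2025.9.17 are assumptions about how a consumer could react, not guidance from the package itself.

# Hypothetical guard for code that imported the now-removed KFP adapter.
try:
    from torchx.pipelines.kfp.adapter import container_from_app
except ImportError:
    # torchx-nightly 2025.9.19 no longer ships torchx.pipelines.kfp; either pin
    # torchx-nightly==2025.9.17 or vendor the adapter if it is still needed.
    container_from_app = None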
torchx/pipelines/kfp/__init__.py
DELETED
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-strict
-
-"""
-This module contains adapters for converting TorchX components into KubeFlow
-Pipeline components.
-
-The current KFP adapters only support single node (1 role and 1 replica)
-components.
-"""
-
-import kfp
-
-from .version import __version__ as __version__  # noqa F401
-
-
-def _check_kfp_version() -> None:
-    if not kfp.__version__.startswith("1."):
-        raise ImportError(
-            f"Only kfp version 1.x.x is supported! kfp version {kfp.__version__}"
-        )
-
-
-_check_kfp_version()
torchx/pipelines/kfp/adapter.py
DELETED
@@ -1,274 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-strict
-
-import json
-import os
-import os.path
-import shlex
-from typing import Mapping, Optional, Tuple
-
-import yaml
-from kfp import components, dsl
-
-# @manual=fbsource//third-party/pypi/kfp:kfp
-from kfp.components.structures import ComponentSpec, OutputSpec
-from kubernetes.client.models import (
-    V1ContainerPort,
-    V1EmptyDirVolumeSource,
-    V1Volume,
-    V1VolumeMount,
-)
-from torchx.schedulers.kubernetes_scheduler import app_to_resource, pod_labels
-from torchx.specs import api
-from typing_extensions import Protocol
-
-from .version import __version__ as __version__  # noqa F401
-
-
-def component_spec_from_app(app: api.AppDef) -> Tuple[str, api.Role]:
-    """
-    component_spec_from_app takes in a TorchX component and generates the yaml
-    spec for it. Notably this doesn't apply resources or port_maps since those
-    must be applied at runtime which is why it returns the role spec as well.
-
-    >>> from torchx import specs
-    >>> from torchx.pipelines.kfp.adapter import component_spec_from_app
-    >>> app_def = specs.AppDef(
-    ...     name="trainer",
-    ...     roles=[specs.Role("trainer", image="foo:latest")],
-    ... )
-    >>> component_spec_from_app(app_def)
-    ('description: ...', Role(...))
-    """
-    assert len(app.roles) == 1, f"KFP adapter only support one role, got {app.roles}"
-
-    role = app.roles[0]
-    assert (
-        role.num_replicas
-        == 1
-        # pyre-fixme[16]: `AppDef` has no attribute `num_replicas`.
-    ), f"KFP adapter only supports one replica, got {app.num_replicas}"
-
-    command = [role.entrypoint, *role.args]
-
-    spec = {
-        "name": f"{app.name}-{role.name}",
-        "description": f"KFP wrapper for TorchX component {app.name}, role {role.name}",
-        "implementation": {
-            "container": {
-                "image": role.image,
-                "command": command,
-                "env": role.env,
-            }
-        },
-        "outputs": [],
-    }
-    return yaml.dump(spec), role
-
-
-class ContainerFactory(Protocol):
-    """
-    ContainerFactory is a protocol that represents a function that when called produces a
-    kfp.dsl.ContainerOp.
-    """
-
-    def __call__(self, *args: object, **kwargs: object) -> dsl.ContainerOp: ...
-
-
-class KFPContainerFactory(ContainerFactory, Protocol):
-    """
-    KFPContainerFactory is a ContainerFactory that also has some KFP metadata
-    attached to it.
-    """
-
-    component_spec: ComponentSpec
-
-
-METADATA_FILE = "/tmp/outputs/mlpipeline-ui-metadata/data.json"
-
-
-def component_from_app(
-    app: api.AppDef, ui_metadata: Optional[Mapping[str, object]] = None
-) -> ContainerFactory:
-    """
-    component_from_app takes in a TorchX component/AppDef and returns a KFP
-    ContainerOp factory. This is equivalent to the
-    `kfp.components.load_component_from_*
-    <https://kubeflow-pipelines.readthedocs.io/en/1.8.22/source/kfp.components.html#kfp.components.load_component_from_text>`_
-    methods.
-
-    Args:
-        app: The AppDef to generate a KFP container factory for.
-        ui_metadata: KFP UI Metadata to output so you can have model results show
-            up in the UI. See
-            https://www.kubeflow.org/docs/components/pipelines/legacy-v1/sdk/output-viewer/
-            for more info on the format.
-
-    >>> from torchx import specs
-    >>> from torchx.pipelines.kfp.adapter import component_from_app
-    >>> app_def = specs.AppDef(
-    ...     name="trainer",
-    ...     roles=[specs.Role("trainer", image="foo:latest")],
-    ... )
-    >>> component_from_app(app_def)
-    <function component_from_app...>
-    """
-
-    role_spec: api.Role
-    spec, role_spec = component_spec_from_app(app)
-    resources: api.Resource = role_spec.resource
-    assert (
-        len(resources.capabilities) == 0
-    ), f"KFP doesn't support capabilities, got {resources.capabilities}"
-    component_factory: KFPContainerFactory = components.load_component_from_text(spec)
-
-    if ui_metadata is not None:
-        # pyre-fixme[16]: `ComponentSpec` has no attribute `outputs`
-        component_factory.component_spec.outputs.append(
-            OutputSpec(
-                name="mlpipeline-ui-metadata",
-                type="MLPipeline UI Metadata",
-                description="ui metadata",
-            )
-        )
-
-    def factory_wrapper(*args: object, **kwargs: object) -> dsl.ContainerOp:
-        c = component_factory(*args, **kwargs)
-        container = c.container
-
-        if ui_metadata is not None:
-            # We generate the UI metadata from the sidecar so we need to make
-            # both the container and the sidecar share the same tmp directory so
-            # the outputs appear in the original container.
-            c.add_volume(V1Volume(name="tmp", empty_dir=V1EmptyDirVolumeSource()))
-            container.add_volume_mount(
-                V1VolumeMount(
-                    name="tmp",
-                    mount_path="/tmp/",
-                )
-            )
-            c.output_artifact_paths["mlpipeline-ui-metadata"] = METADATA_FILE
-            c.add_sidecar(_ui_metadata_sidecar(ui_metadata))
-
-        cpu = resources.cpu
-        if cpu >= 0:
-            cpu_str = f"{int(cpu*1000)}m"
-            container.set_cpu_request(cpu_str)
-            container.set_cpu_limit(cpu_str)
-        mem = resources.memMB
-        if mem >= 0:
-            mem_str = f"{int(mem)}M"
-            container.set_memory_request(mem_str)
-            container.set_memory_limit(mem_str)
-        gpu = resources.gpu
-        if gpu > 0:
-            container.set_gpu_limit(str(gpu))
-
-        for name, port in role_spec.port_map.items():
-            container.add_port(
-                V1ContainerPort(
-                    name=name,
-                    container_port=port,
-                ),
-            )
-
-        c.pod_labels.update(pod_labels(app, 0, role_spec, 0, app.name))
-
-        return c
-
-    return factory_wrapper
-
-
-def _ui_metadata_sidecar(
-    ui_metadata: Mapping[str, object], image: str = "alpine"
-) -> dsl.Sidecar:
-    shell_encoded = shlex.quote(json.dumps(ui_metadata))
-    dirname = os.path.dirname(METADATA_FILE)
-    return dsl.Sidecar(
-        name="ui-metadata-sidecar",
-        image=image,
-        command=[
-            "sh",
-            "-c",
-            f"mkdir -p {dirname}; echo {shell_encoded} > {METADATA_FILE}",
-        ],
-        mirror_volume_mounts=True,
-    )
-
-
-def container_from_app(
-    app: api.AppDef,
-    *args: object,
-    ui_metadata: Optional[Mapping[str, object]] = None,
-    **kwargs: object,
-) -> dsl.ContainerOp:
-    """
-    container_from_app transforms the app into a KFP component and returns a
-    corresponding ContainerOp instance.
-
-    See component_from_app for description on the arguments. Any unspecified
-    arguments are passed through to the KFP container factory method.
-
-    >>> import kfp
-    >>> from torchx import specs
-    >>> from torchx.pipelines.kfp.adapter import container_from_app
-    >>> app_def = specs.AppDef(
-    ...     name="trainer",
-    ...     roles=[specs.Role("trainer", image="foo:latest")],
-    ... )
-    >>> def pipeline():
-    ...     trainer = container_from_app(app_def)
-    ...     print(trainer)
-    >>> kfp.compiler.Compiler().compile(
-    ...     pipeline_func=pipeline,
-    ...     package_path="/tmp/pipeline.yaml",
-    ... )
-    {'ContainerOp': {... 'name': 'trainer-trainer', ...}}
-    """
-    factory = component_from_app(app, ui_metadata)
-    return factory(*args, **kwargs)
-
-
-def resource_from_app(
-    app: api.AppDef,
-    queue: str,
-    service_account: Optional[str] = None,
-) -> dsl.ResourceOp:
-    """
-    resource_from_app generates a KFP ResourceOp from the provided app that uses
-    the Volcano job scheduler on Kubernetes to run distributed apps. See
-    https://volcano.sh/en/docs/ for more info on Volcano and how to install.
-
-    Args:
-        app: The torchx AppDef to adapt.
-        queue: the Volcano queue to schedule the operator in.
-
-    >>> import kfp
-    >>> from torchx import specs
-    >>> from torchx.pipelines.kfp.adapter import resource_from_app
-    >>> app_def = specs.AppDef(
-    ...     name="trainer",
-    ...     roles=[specs.Role("trainer", image="foo:latest", num_replicas=3)],
-    ... )
-    >>> def pipeline():
-    ...     trainer = resource_from_app(app_def, queue="test")
-    ...     print(trainer)
-    >>> kfp.compiler.Compiler().compile(
-    ...     pipeline_func=pipeline,
-    ...     package_path="/tmp/pipeline.yaml",
-    ... )
-    {'ResourceOp': {... 'name': 'trainer-0', ... 'name': 'trainer-1', ... 'name': 'trainer-2', ...}}
-    """
-    return dsl.ResourceOp(
-        name=app.name,
-        action="create",
-        success_condition="status.state.phase = Completed",
-        failure_condition="status.state.phase = Failed",
-        k8s_resource=app_to_resource(app, queue, service_account=service_account),
-    )
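For context, the doctests in the deleted container_from_app above show how the adapter was meant to be used from a KFP v1 pipeline. A consolidated sketch of that flow follows; the AppDef values and the output path "pipeline.yaml" are illustrative choices, not part of the diff, and it only works with kfp 1.x and a torchx build that still ships torchx.pipelines.kfp (2025.9.17 or earlier).

# Usage sketch assembled from the doctests in the deleted adapter.py above.
import kfp
from torchx import specs
from torchx.pipelines.kfp.adapter import container_from_app

app_def = specs.AppDef(
    name="trainer",
    roles=[specs.Role("trainer", image="foo:latest")],
)

def pipeline() -> None:
    # Each call adds one single-role, single-replica ContainerOp to the pipeline.
    container_from_app(app_def)

kfp.compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.yaml")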
torchx/pipelines/kfp/version.py
DELETED
@@ -1,19 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-strict
-
-# Follows PEP-0440 version scheme guidelines
-# https://www.python.org/dev/peps/pep-0440/#version-scheme
-#
-# Examples:
-# 0.1.0.devN  # Developmental release
-# 0.1.0aN  # Alpha release
-# 0.1.0bN  # Beta release
-# 0.1.0rcN  # Release Candidate
-# 0.1.0  # Final release
-__version__ = "0.1.0.dev0"
torchx/schedulers/ray/ray_common.py
DELETED
@@ -1,22 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional
-
-TORCHX_RANK0_HOST: str = "TORCHX_RANK0_HOST"
-
-
-@dataclass
-class RayActor:
-    """Describes an actor (a.k.a. worker/replica in TorchX terms)."""
-
-    name: str
-    command: List[str]
-    env: Dict[str, str] = field(default_factory=dict)
-    num_cpus: int = 1
-    num_gpus: int = 0
-    min_replicas: Optional[int] = None
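The RayActor dataclass above is the schema the deleted ray_driver.py (next diff) reads back from actors.json: load_actor_json() calls json.load() and then json.loads(), so the file must contain a JSON string that itself encodes a list of actor dicts. A sketch of producing such a file is below; whether the real ray_scheduler wrote the file in exactly this way is not shown in this diff, so treat it as illustrative only.

# Illustrative writer for the double-encoded actors.json that load_actor_json() expects.
import dataclasses
import json

from torchx.schedulers.ray.ray_common import RayActor  # removed in 2025.9.19

actors = [
    RayActor(name="trainer-0", command=["python", "-m", "train"], num_cpus=2, num_gpus=1),
    RayActor(name="trainer-1", command=["python", "-m", "train"], num_cpus=2, num_gpus=1),
]

with open("actors.json", "w") as f:
    # json.dump of a json.dumps string: json.load returns a str, json.loads parses the list.
    json.dump(json.dumps([dataclasses.asdict(a) for a in actors]), f)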
torchx/schedulers/ray/ray_driver.py
DELETED
@@ -1,307 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-We use placement groups to reserve resources in the ray cluster, it
-ensure that a job will not lose the resources it used to have before
-the job is finished. The deadlock situtation while launch multiple jobs at the
-same time is avoided by create a big placement group that contains the minimum
-required command actors for the job. Once the placement groups are created(may
-not be scheduled on a physical node yet), then we schedule command actors to
-the corresponding placement group, each actor is associated with a placement
-group which hold the resource the acotr needs. Each time a placement group successfully
-acquired the resources from the ray cluster, the actor scheduled to this placement group
-will be executed. Command actors are state machines their behavior is defined by the
-_step function, this give more flexibility to us if we want to bette handle the
-node failures.
-"""
-import json
-import logging
-import os
-import socket
-import subprocess
-import sys
-
-from contextlib import closing
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-
-import ray
-from ray.util.placement_group import PlacementGroup
-
-if TYPE_CHECKING:
-    from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
-# Hack to make code work for tests as well as running ray job.
-# For tests the `torchx.schedulers.ray.ray_common` import must be used
-# For running ray jobs `ray_common` import must be used
-try:
-    # pyre-fixme[21]: Could not find a module corresponding to import `ray_common`.
-    from ray_common import RayActor, TORCHX_RANK0_HOST  # noqa: F811
-except ModuleNotFoundError:
-    from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
-_logger: logging.Logger = logging.getLogger(__name__)
-_logger.setLevel(logging.getLevelName(os.environ.get("LOGLEVEL", "INFO")))
-logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
-
-
-@dataclass
-class RayResult:
-    id: str
-
-
-class TaskCompleted(RayResult):
-    pass
-
-
-class CommandActorScheduled(RayResult):
-    pass
-
-
-@ray.remote
-class CommandActor:  # pragma: no cover
-    def __init__(self, cmd: List[str], env: Dict[str, str]) -> None:
-        self.cmd: List[str] = cmd
-        self.env: Dict[str, str] = env
-
-    def exec_module(
-        self, master_addr: str, master_port: int, actor_id: str
-    ) -> TaskCompleted:
-        """Execute a user script"""
-        if master_addr is None or master_port is None:
-            raise RuntimeError(
-                "Either MASTER_ADDR or MASTER_PORT are not set. This is most likely bug in torchx"
-                "Open issue at https://github.com/pytorch/torchx"
-            )
-        worker_evn = {}
-        worker_evn.update(os.environ)
-        worker_evn.update(self.env)
-        worker_evn[TORCHX_RANK0_HOST] = master_addr
-        popen = subprocess.Popen(self.cmd, env=worker_evn)
-
-        returncode = popen.wait()
-        _logger.info(f"Finished with code {returncode}")
-
-        if returncode != 0:
-            raise RuntimeError(f"exec_module failed with return code {returncode}")
-
-        return TaskCompleted(actor_id)
-
-    def schedule(self, actor_id: str) -> CommandActorScheduled:
-        """Testing if a command actor is scheduled"""
-        return CommandActorScheduled(actor_id)
-
-    def get_actor_address_and_port(self) -> Tuple[str, int]:
-        addr = ray.util.get_node_ip_address()
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(("", 0))
-            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            port = s.getsockname()[1]
-        return addr, port
-
-
-def load_actor_json(filename: str) -> List[RayActor]:
-    """Loading replicas specifications from a JSON file"""
-    with open(filename) as f:
-        actors: List[RayActor] = []
-        # Yes this is gross but it works
-        actor_dict = json.load(f)
-        actor_dict = json.loads(actor_dict)
-        for actor in actor_dict:
-            actors.append(RayActor(**actor))
-        return actors
-
-
-def create_placement_group_async(replicas: List[RayActor]) -> PlacementGroup:  # type: ignore
-    """return a placement group reference, the corresponding placement group could be scheduled or pending"""
-    bundles = []
-    for replica in replicas:
-        bundles.append({"CPU": replica.num_cpus, "GPU": replica.num_gpus})
-
-    pg = ray.util.placement_group(bundles, strategy="SPREAD")
-    return pg
-
-
-@dataclass
-class ActorInfo:
-    """Used to store the information for restoring a failed command actor"""
-
-    pg: PlacementGroup
-    replica: RayActor
-    actor: CommandActor
-
-
-class RayDriver:
-    def __init__(self, replicas: List[RayActor]) -> None:
-        self.replicas = replicas
-        self.master_node_id: Optional[str] = None  # the actor id of the master node
-        self.rank_0_address: Optional[str] = None
-        self.rank_0_port: Optional[int] = None
-        self.max_replicas: int = len(replicas)
-        self.min_replicas: int
-        if replicas[0].min_replicas is None:
-            self.min_replicas = self.max_replicas
-        else:
-            self.min_replicas = replicas[0].min_replicas  # pyre-ignore[8]
-
-        self.placement_groups: List[PlacementGroup] = (
-            []
-        )  # all the placement groups, shall never change
-        self.actor_info_of_id: Dict[str, ActorInfo] = (
-            {}
-        )  # store the info used to recover an actor
-        self.active_tasks: List["ray.ObjectRef"] = []  # list of active tasks
-
-        self.terminating: bool = False  # if the job has finished and being terminated
-        self.command_actors_count: int = 0  # number of created command actors
-
-    def init_placement_groups(self) -> None:
-        """Initialize all placement groups needed for this job"""
-        # find the actor specifications of a given placement group
-        replica_ix_of_pg: List[int] = [0] + list(
-            range(
-                self.min_replicas,
-                self.max_replicas + 1,
-            )
-        )
-        # create all the placement groups
-        initial_group = create_placement_group_async(
-            self.replicas[replica_ix_of_pg[0] : replica_ix_of_pg[1]]
-        )
-        _logger.info("Waiting for minimum placement group to start.")
-        ready = initial_group.wait(100)
-        if not ready:  # pragma: no cover
-            raise TimeoutError(
-                "Placement group creation timed out. Make sure "
-                "your cluster either has enough resources or use "
-                "an autoscaling cluster. Current resources "
-                "available: {}, resources requested by the "
-                "placement group: {}".format(
-                    ray.available_resources(), initial_group.bundle_specs
-                )
-            )
-        self.placement_groups.append(initial_group)
-        for i in range(1, len(replica_ix_of_pg) - 1):
-            self.placement_groups.append(
-                create_placement_group_async(
-                    self.replicas[replica_ix_of_pg[i] : replica_ix_of_pg[i + 1]]
-                )
-            )
-
-    def pop_actor_info(self, actor_id: str) -> ActorInfo:
-        """Remove and return the info of a dead command actor"""
-        return self.actor_info_of_id.pop(actor_id)
-
-    def create_and_schedule_actor(self, pg: PlacementGroup, replica: RayActor) -> None:
-        """create an command actor in the given placement group"""
-        # create the command actor
-        actor = CommandActor.options(  # pyre-ignore[16]
-            placement_group=pg,
-            num_cpus=replica.num_cpus,
-            num_gpus=replica.num_gpus,
-        ).remote(replica.command, replica.env)
-
-        # get the actor id of the created actor
-        actor_id = actor._actor_id.hex()
-        # launch a task to check if the actor is scheduled
-        self.active_tasks.append(actor.schedule.remote(actor_id))
-        # save the actor info for recovering from node failures
-        self.actor_info_of_id[actor_id] = ActorInfo(
-            actor=actor,
-            pg=pg,
-            replica=replica,
-        )
-
-    def place_command_actors(self) -> None:
-        """Creating all command actors in all placement groups"""
-        # find the placement group index for a replica(actor's specification)
-        pg_ix_of_replica: List[int] = [
-            max(0, i - self.min_replicas + 1) for i in range(len(self.replicas))
-        ]
-        # create the actors
-        for i in range(len(self.replicas)):
-            pg_ix = pg_ix_of_replica[i]
-            pg = self.placement_groups[pg_ix]  # find the created placement group
-            replica = self.replicas[i]
-            self.create_and_schedule_actor(pg, replica)
-
-    def _step(self) -> bool:
-        """Handling command actor's return"""
-        result: RayResult  # execution result
-        _logger.info(f"running ray.wait on {self.active_tasks}")
-        # ray.wait is partial waiting
-        completed_tasks, self.active_tasks = ray.wait(self.active_tasks)
-        # If a failure occurs the ObjectRef will be marked as completed.
-        # Calling ray.get will expose the failure as a RayActorError.
-        for object_ref in completed_tasks:
-            result = ray.get(object_ref)
-            if isinstance(result, CommandActorScheduled):
-                if not self.terminating:
-                    actor = self.actor_info_of_id[result.id].actor
-                    if self.master_node_id is None:
-                        # make this actor be the master node
-                        self.master_node_id = result.id
-                        self.rank_0_address, self.rank_0_port = ray.get(
-                            actor.get_actor_address_and_port.remote()  # pyre-ignore
-                        )
-                        self.active_tasks.append(
-                            actor.exec_module.remote(  # pyre-ignore
-                                "localhost", 0, result.id
-                            )
-                        )
-                    else:
-                        self.active_tasks.append(
-                            actor.exec_module.remote(
-                                self.rank_0_address, self.rank_0_port, result.id
-                            )
-                        )
-                    self.command_actors_count += 1
-            elif isinstance(result, TaskCompleted):
-                self.terminating = (
-                    True  # terminating the job, wait for all actors to finish
-                )
-                self.command_actors_count -= 1  # 1 completed command actor
-                self.pop_actor_info(result.id)
-                if (
-                    self.command_actors_count == 0
-                ):  # all the command actors have finished
-                    return True  # is terminal
-            else:
-                raise RuntimeError(
-                    f"Ray actor returns unknown type {type(result)}"
-                    "This is most likely bug in torchx"
-                    "Open issue at https://github.com/pytorch/torchx"
-                )
-        return False
-
-    def run(self) -> None:
-        """This is the main loop the ray driver, it executes the user script on the scheduled nodes,
-        and restart the failed nodes(node failures). The loop ends when all the actors that joining
-        the job exits."""
-        self.terminating = False
-        self.command_actors_count = 0
-        # Await return result of remote ray function and initialize new command actors
-        while len(self.active_tasks) > 0:
-            terminal = self._step()
-            if terminal:
-                break
-
-
-def main() -> None:  # pragma: no cover
-    actors: List[RayActor] = load_actor_json("actors.json")
-    driver = RayDriver(actors)
-    ray.init(address="auto", namespace="torchx-ray")
-    driver.init_placement_groups()
-    _logger.info("Successfully created placement groups")
-    driver.place_command_actors()
-    _logger.info("Successfully placed command actors")
-    _logger.info("Entering main loop, start executing the script on worker nodes")
-    driver.run()
-
-
-if __name__ == "__main__":
-    main()
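The index arithmetic in init_placement_groups and place_command_actors above determines which replica lands in which placement group: the first min_replicas replicas share one group and every additional replica gets its own. A small worked check of that math, using max_replicas=4 and min_replicas=2 purely for illustration, is:

# Worked example of the placement-group index math from the deleted RayDriver.
max_replicas, min_replicas = 4, 2

# Boundaries of replica indices per placement group: [0, 2, 3, 4]
# -> group 0 holds replicas 0-1, group 1 holds replica 2, group 2 holds replica 3.
replica_ix_of_pg = [0] + list(range(min_replicas, max_replicas + 1))

# Placement group index for each replica: [0, 0, 1, 2]
pg_ix_of_replica = [max(0, i - min_replicas + 1) for i in range(max_replicas)]

print(replica_ix_of_pg)   # [0, 2, 3, 4]
print(pg_ix_of_replica)   # [0, 0, 1, 2]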