torchx-nightly 2025.9.16__py3-none-any.whl → 2025.9.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchx-nightly might be problematic.

@@ -1,274 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- import json
- import os
- import os.path
- import shlex
- from typing import Mapping, Optional, Tuple
-
- import yaml
- from kfp import components, dsl
-
- # @manual=fbsource//third-party/pypi/kfp:kfp
- from kfp.components.structures import ComponentSpec, OutputSpec
- from kubernetes.client.models import (
-     V1ContainerPort,
-     V1EmptyDirVolumeSource,
-     V1Volume,
-     V1VolumeMount,
- )
- from torchx.schedulers.kubernetes_scheduler import app_to_resource, pod_labels
- from torchx.specs import api
- from typing_extensions import Protocol
-
- from .version import __version__ as __version__  # noqa F401
-
-
- def component_spec_from_app(app: api.AppDef) -> Tuple[str, api.Role]:
-     """
-     component_spec_from_app takes in a TorchX component and generates the yaml
-     spec for it. Notably this doesn't apply resources or port_maps since those
-     must be applied at runtime which is why it returns the role spec as well.
-
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import component_spec_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> component_spec_from_app(app_def)
-     ('description: ...', Role(...))
-     """
-     assert len(app.roles) == 1, f"KFP adapter only supports one role, got {app.roles}"
-
-     role = app.roles[0]
-     assert (
-         role.num_replicas
-         == 1
-         # pyre-fixme[16]: `AppDef` has no attribute `num_replicas`.
-     ), f"KFP adapter only supports one replica, got {app.num_replicas}"
-
-     command = [role.entrypoint, *role.args]
-
-     spec = {
-         "name": f"{app.name}-{role.name}",
-         "description": f"KFP wrapper for TorchX component {app.name}, role {role.name}",
-         "implementation": {
-             "container": {
-                 "image": role.image,
-                 "command": command,
-                 "env": role.env,
-             }
-         },
-         "outputs": [],
-     }
-     return yaml.dump(spec), role
-
-
- class ContainerFactory(Protocol):
-     """
-     ContainerFactory is a protocol that represents a function that when called produces a
-     kfp.dsl.ContainerOp.
-     """
-
-     def __call__(self, *args: object, **kwargs: object) -> dsl.ContainerOp: ...
-
-
- class KFPContainerFactory(ContainerFactory, Protocol):
-     """
-     KFPContainerFactory is a ContainerFactory that also has some KFP metadata
-     attached to it.
-     """
-
-     component_spec: ComponentSpec
-
-
- METADATA_FILE = "/tmp/outputs/mlpipeline-ui-metadata/data.json"
-
-
- def component_from_app(
-     app: api.AppDef, ui_metadata: Optional[Mapping[str, object]] = None
- ) -> ContainerFactory:
-     """
-     component_from_app takes in a TorchX component/AppDef and returns a KFP
-     ContainerOp factory. This is equivalent to the
-     `kfp.components.load_component_from_*
-     <https://kubeflow-pipelines.readthedocs.io/en/1.8.22/source/kfp.components.html#kfp.components.load_component_from_text>`_
-     methods.
-
-     Args:
-         app: The AppDef to generate a KFP container factory for.
-         ui_metadata: KFP UI Metadata to output so you can have model results show
-             up in the UI. See
-             https://www.kubeflow.org/docs/components/pipelines/legacy-v1/sdk/output-viewer/
-             for more info on the format.
-
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import component_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> component_from_app(app_def)
-     <function component_from_app...>
-     """
-
-     role_spec: api.Role
-     spec, role_spec = component_spec_from_app(app)
-     resources: api.Resource = role_spec.resource
-     assert (
-         len(resources.capabilities) == 0
-     ), f"KFP doesn't support capabilities, got {resources.capabilities}"
-     component_factory: KFPContainerFactory = components.load_component_from_text(spec)
-
-     if ui_metadata is not None:
-         # pyre-fixme[16]: `ComponentSpec` has no attribute `outputs`
-         component_factory.component_spec.outputs.append(
-             OutputSpec(
-                 name="mlpipeline-ui-metadata",
-                 type="MLPipeline UI Metadata",
-                 description="ui metadata",
-             )
-         )
-
-     def factory_wrapper(*args: object, **kwargs: object) -> dsl.ContainerOp:
-         c = component_factory(*args, **kwargs)
-         container = c.container
-
-         if ui_metadata is not None:
-             # We generate the UI metadata from the sidecar so we need to make
-             # both the container and the sidecar share the same tmp directory so
-             # the outputs appear in the original container.
-             c.add_volume(V1Volume(name="tmp", empty_dir=V1EmptyDirVolumeSource()))
-             container.add_volume_mount(
-                 V1VolumeMount(
-                     name="tmp",
-                     mount_path="/tmp/",
-                 )
-             )
-             c.output_artifact_paths["mlpipeline-ui-metadata"] = METADATA_FILE
-             c.add_sidecar(_ui_metadata_sidecar(ui_metadata))
-
-         cpu = resources.cpu
-         if cpu >= 0:
-             cpu_str = f"{int(cpu*1000)}m"
-             container.set_cpu_request(cpu_str)
-             container.set_cpu_limit(cpu_str)
-         mem = resources.memMB
-         if mem >= 0:
-             mem_str = f"{int(mem)}M"
-             container.set_memory_request(mem_str)
-             container.set_memory_limit(mem_str)
-         gpu = resources.gpu
-         if gpu > 0:
-             container.set_gpu_limit(str(gpu))
-
-         for name, port in role_spec.port_map.items():
-             container.add_port(
-                 V1ContainerPort(
-                     name=name,
-                     container_port=port,
-                 ),
-             )
-
-         c.pod_labels.update(pod_labels(app, 0, role_spec, 0, app.name))
-
-         return c
-
-     return factory_wrapper
-
-
- def _ui_metadata_sidecar(
-     ui_metadata: Mapping[str, object], image: str = "alpine"
- ) -> dsl.Sidecar:
-     shell_encoded = shlex.quote(json.dumps(ui_metadata))
-     dirname = os.path.dirname(METADATA_FILE)
-     return dsl.Sidecar(
-         name="ui-metadata-sidecar",
-         image=image,
-         command=[
-             "sh",
-             "-c",
-             f"mkdir -p {dirname}; echo {shell_encoded} > {METADATA_FILE}",
-         ],
-         mirror_volume_mounts=True,
-     )
-
-
- def container_from_app(
-     app: api.AppDef,
-     *args: object,
-     ui_metadata: Optional[Mapping[str, object]] = None,
-     **kwargs: object,
- ) -> dsl.ContainerOp:
-     """
-     container_from_app transforms the app into a KFP component and returns a
-     corresponding ContainerOp instance.
-
-     See component_from_app for description on the arguments. Any unspecified
-     arguments are passed through to the KFP container factory method.
-
-     >>> import kfp
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import container_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> def pipeline():
-     ...     trainer = container_from_app(app_def)
-     ...     print(trainer)
-     >>> kfp.compiler.Compiler().compile(
-     ...     pipeline_func=pipeline,
-     ...     package_path="/tmp/pipeline.yaml",
-     ... )
-     {'ContainerOp': {... 'name': 'trainer-trainer', ...}}
-     """
-     factory = component_from_app(app, ui_metadata)
-     return factory(*args, **kwargs)
-
-
- def resource_from_app(
-     app: api.AppDef,
-     queue: str,
-     service_account: Optional[str] = None,
- ) -> dsl.ResourceOp:
-     """
-     resource_from_app generates a KFP ResourceOp from the provided app that uses
-     the Volcano job scheduler on Kubernetes to run distributed apps. See
-     https://volcano.sh/en/docs/ for more info on Volcano and how to install.
-
-     Args:
-         app: The torchx AppDef to adapt.
-         queue: the Volcano queue to schedule the operator in.
-
-     >>> import kfp
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import resource_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest", num_replicas=3)],
-     ... )
-     >>> def pipeline():
-     ...     trainer = resource_from_app(app_def, queue="test")
-     ...     print(trainer)
-     >>> kfp.compiler.Compiler().compile(
-     ...     pipeline_func=pipeline,
-     ...     package_path="/tmp/pipeline.yaml",
-     ... )
-     {'ResourceOp': {... 'name': 'trainer-0', ... 'name': 'trainer-1', ... 'name': 'trainer-2', ...}}
-     """
-     return dsl.ResourceOp(
-         name=app.name,
-         action="create",
-         success_condition="status.state.phase = Completed",
-         failure_condition="status.state.phase = Failed",
-         k8s_resource=app_to_resource(app, queue, service_account=service_account),
-     )
@@ -1,19 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- # Follows PEP-0440 version scheme guidelines
- # https://www.python.org/dev/peps/pep-0440/#version-scheme
- #
- # Examples:
- # 0.1.0.devN # Developmental release
- # 0.1.0aN # Alpha release
- # 0.1.0bN # Beta release
- # 0.1.0rcN # Release Candidate
- # 0.1.0 # Final release
- __version__ = "0.1.0.dev0"
@@ -1,6 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
@@ -1,22 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- from dataclasses import dataclass, field
- from typing import Dict, List, Optional
-
- TORCHX_RANK0_HOST: str = "TORCHX_RANK0_HOST"
-
-
- @dataclass
- class RayActor:
-     """Describes an actor (a.k.a. worker/replica in TorchX terms)."""
-
-     name: str
-     command: List[str]
-     env: Dict[str, str] = field(default_factory=dict)
-     num_cpus: int = 1
-     num_gpus: int = 0
-     min_replicas: Optional[int] = None
@@ -1,307 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- """
- We use placement groups to reserve resources in the Ray cluster; this
- ensures that a job will not lose the resources it already holds before
- it finishes. The deadlock that could arise when launching multiple jobs
- at the same time is avoided by creating one big placement group that
- contains the minimum required command actors for the job. Once the
- placement groups are created (they may not yet be scheduled on a
- physical node), we schedule command actors onto them; each actor is
- associated with a placement group that holds the resources it needs.
- Each time a placement group successfully acquires its resources from
- the Ray cluster, the actor scheduled onto it is executed. Command
- actors are state machines whose behavior is defined by the _step
- function, which gives us more flexibility to better handle node failures.
- """
- import json
- import logging
- import os
- import socket
- import subprocess
- import sys
-
- from contextlib import closing
- from dataclasses import dataclass
- from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-
- import ray
- from ray.util.placement_group import PlacementGroup
-
- if TYPE_CHECKING:
-     from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
- # Hack to make code work for tests as well as running ray job.
- # For tests the `torchx.schedulers.ray.ray_common` import must be used
- # For running ray jobs `ray_common` import must be used
- try:
-     # pyre-fixme[21]: Could not find a module corresponding to import `ray_common`.
-     from ray_common import RayActor, TORCHX_RANK0_HOST  # noqa: F811
- except ModuleNotFoundError:
-     from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
- _logger: logging.Logger = logging.getLogger(__name__)
- _logger.setLevel(logging.getLevelName(os.environ.get("LOGLEVEL", "INFO")))
- logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
-
-
- @dataclass
- class RayResult:
-     id: str
-
-
- class TaskCompleted(RayResult):
-     pass
-
-
- class CommandActorScheduled(RayResult):
-     pass
-
-
- @ray.remote
- class CommandActor:  # pragma: no cover
-     def __init__(self, cmd: List[str], env: Dict[str, str]) -> None:
-         self.cmd: List[str] = cmd
-         self.env: Dict[str, str] = env
-
-     def exec_module(
-         self, master_addr: str, master_port: int, actor_id: str
-     ) -> TaskCompleted:
-         """Execute a user script"""
-         if master_addr is None or master_port is None:
-             raise RuntimeError(
-                 "Either MASTER_ADDR or MASTER_PORT is not set. This is most likely a bug in torchx. "
-                 "Please open an issue at https://github.com/pytorch/torchx"
-             )
-         worker_env = {}
-         worker_env.update(os.environ)
-         worker_env.update(self.env)
-         worker_env[TORCHX_RANK0_HOST] = master_addr
-         popen = subprocess.Popen(self.cmd, env=worker_env)
-
-         returncode = popen.wait()
-         _logger.info(f"Finished with code {returncode}")
-
-         if returncode != 0:
-             raise RuntimeError(f"exec_module failed with return code {returncode}")
-
-         return TaskCompleted(actor_id)
-
-     def schedule(self, actor_id: str) -> CommandActorScheduled:
-         """Confirm that this command actor has been scheduled"""
-         return CommandActorScheduled(actor_id)
-
-     def get_actor_address_and_port(self) -> Tuple[str, int]:
-         addr = ray.util.get_node_ip_address()
-         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-             s.bind(("", 0))
-             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-             port = s.getsockname()[1]
-         return addr, port
-
-
- def load_actor_json(filename: str) -> List[RayActor]:
-     """Load replica specifications from a JSON file"""
-     with open(filename) as f:
-         actors: List[RayActor] = []
-         # Yes this is gross but it works
-         actor_dict = json.load(f)
-         actor_dict = json.loads(actor_dict)
-         for actor in actor_dict:
-             actors.append(RayActor(**actor))
-     return actors
-
-
- def create_placement_group_async(replicas: List[RayActor]) -> PlacementGroup:  # type: ignore
-     """Return a placement group reference; the corresponding placement group may be scheduled or still pending"""
-     bundles = []
-     for replica in replicas:
-         bundles.append({"CPU": replica.num_cpus, "GPU": replica.num_gpus})
-
-     pg = ray.util.placement_group(bundles, strategy="SPREAD")
-     return pg
-
-
- @dataclass
- class ActorInfo:
-     """Used to store the information for restoring a failed command actor"""
-
-     pg: PlacementGroup
-     replica: RayActor
-     actor: CommandActor
-
-
- class RayDriver:
-     def __init__(self, replicas: List[RayActor]) -> None:
-         self.replicas = replicas
-         self.master_node_id: Optional[str] = None  # the actor id of the master node
-         self.rank_0_address: Optional[str] = None
-         self.rank_0_port: Optional[int] = None
-         self.max_replicas: int = len(replicas)
-         self.min_replicas: int
-         if replicas[0].min_replicas is None:
-             self.min_replicas = self.max_replicas
-         else:
-             self.min_replicas = replicas[0].min_replicas  # pyre-ignore[8]
-
-         self.placement_groups: List[PlacementGroup] = (
-             []
-         )  # all the placement groups, shall never change
-         self.actor_info_of_id: Dict[str, ActorInfo] = (
-             {}
-         )  # store the info used to recover an actor
-         self.active_tasks: List["ray.ObjectRef"] = []  # list of active tasks
-
-         self.terminating: bool = False  # whether the job has finished and is being terminated
-         self.command_actors_count: int = 0  # number of created command actors
-
-     def init_placement_groups(self) -> None:
-         """Initialize all placement groups needed for this job"""
-         # replica index boundaries for each placement group
-         replica_ix_of_pg: List[int] = [0] + list(
-             range(
-                 self.min_replicas,
-                 self.max_replicas + 1,
-             )
-         )
-         # create all the placement groups
-         initial_group = create_placement_group_async(
-             self.replicas[replica_ix_of_pg[0] : replica_ix_of_pg[1]]
-         )
-         _logger.info("Waiting for minimum placement group to start.")
-         ready = initial_group.wait(100)
-         if not ready:  # pragma: no cover
-             raise TimeoutError(
-                 "Placement group creation timed out. Make sure "
-                 "your cluster either has enough resources or use "
-                 "an autoscaling cluster. Current resources "
-                 "available: {}, resources requested by the "
-                 "placement group: {}".format(
-                     ray.available_resources(), initial_group.bundle_specs
-                 )
-             )
-         self.placement_groups.append(initial_group)
-         for i in range(1, len(replica_ix_of_pg) - 1):
-             self.placement_groups.append(
-                 create_placement_group_async(
-                     self.replicas[replica_ix_of_pg[i] : replica_ix_of_pg[i + 1]]
-                 )
-             )
-
-     def pop_actor_info(self, actor_id: str) -> ActorInfo:
-         """Remove and return the info of a dead command actor"""
-         return self.actor_info_of_id.pop(actor_id)
-
-     def create_and_schedule_actor(self, pg: PlacementGroup, replica: RayActor) -> None:
-         """Create a command actor in the given placement group"""
-         # create the command actor
-         actor = CommandActor.options(  # pyre-ignore[16]
-             placement_group=pg,
-             num_cpus=replica.num_cpus,
-             num_gpus=replica.num_gpus,
-         ).remote(replica.command, replica.env)
-
-         # get the actor id of the created actor
-         actor_id = actor._actor_id.hex()
-         # launch a task to check if the actor is scheduled
-         self.active_tasks.append(actor.schedule.remote(actor_id))
-         # save the actor info for recovering from node failures
-         self.actor_info_of_id[actor_id] = ActorInfo(
-             actor=actor,
-             pg=pg,
-             replica=replica,
-         )
-
-     def place_command_actors(self) -> None:
-         """Create all command actors in all placement groups"""
-         # find the placement group index for each replica (actor specification)
-         pg_ix_of_replica: List[int] = [
-             max(0, i - self.min_replicas + 1) for i in range(len(self.replicas))
-         ]
-         # create the actors
-         for i in range(len(self.replicas)):
-             pg_ix = pg_ix_of_replica[i]
-             pg = self.placement_groups[pg_ix]  # find the created placement group
-             replica = self.replicas[i]
-             self.create_and_schedule_actor(pg, replica)
-
-     def _step(self) -> bool:
-         """Handle command actor results"""
-         result: RayResult  # execution result
-         _logger.info(f"running ray.wait on {self.active_tasks}")
-         # ray.wait returns as soon as at least one task completes
-         completed_tasks, self.active_tasks = ray.wait(self.active_tasks)
-         # If a failure occurs the ObjectRef will be marked as completed.
-         # Calling ray.get will expose the failure as a RayActorError.
-         for object_ref in completed_tasks:
-             result = ray.get(object_ref)
-             if isinstance(result, CommandActorScheduled):
-                 if not self.terminating:
-                     actor = self.actor_info_of_id[result.id].actor
-                     if self.master_node_id is None:
-                         # make this actor the master node
-                         self.master_node_id = result.id
-                         self.rank_0_address, self.rank_0_port = ray.get(
-                             actor.get_actor_address_and_port.remote()  # pyre-ignore
-                         )
-                         self.active_tasks.append(
-                             actor.exec_module.remote(  # pyre-ignore
-                                 "localhost", 0, result.id
-                             )
-                         )
-                     else:
-                         self.active_tasks.append(
-                             actor.exec_module.remote(
-                                 self.rank_0_address, self.rank_0_port, result.id
-                             )
-                         )
-                     self.command_actors_count += 1
-             elif isinstance(result, TaskCompleted):
-                 self.terminating = (
-                     True  # terminating the job, wait for all actors to finish
-                 )
-                 self.command_actors_count -= 1  # one command actor has completed
-                 self.pop_actor_info(result.id)
-                 if (
-                     self.command_actors_count == 0
-                 ):  # all the command actors have finished
-                     return True  # is terminal
-             else:
-                 raise RuntimeError(
-                     f"Ray actor returned unknown type {type(result)}. "
-                     "This is most likely a bug in torchx. "
-                     "Please open an issue at https://github.com/pytorch/torchx"
-                 )
-         return False
-
-     def run(self) -> None:
-         """Main loop of the Ray driver: it executes the user script on the scheduled nodes
-         and restarts actors on failed nodes. The loop ends when all the actors that joined
-         the job have exited."""
-         self.terminating = False
-         self.command_actors_count = 0
-         # Await return result of remote ray function and initialize new command actors
-         while len(self.active_tasks) > 0:
-             terminal = self._step()
-             if terminal:
-                 break
-
-
- def main() -> None:  # pragma: no cover
-     actors: List[RayActor] = load_actor_json("actors.json")
-     driver = RayDriver(actors)
-     ray.init(address="auto", namespace="torchx-ray")
-     driver.init_placement_groups()
-     _logger.info("Successfully created placement groups")
-     driver.place_command_actors()
-     _logger.info("Successfully placed command actors")
-     _logger.info("Entering main loop, start executing the script on worker nodes")
-     driver.run()
-
-
- if __name__ == "__main__":
-     main()