torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of torchx-nightly might be problematic; see the registry page for details.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +4 -3
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +75 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +418 -30
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +181 -4
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
- torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
--- torchx/schedulers/ray/ray_driver.py
+++ /dev/null
@@ -1,307 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-We use placement groups to reserve resources in the ray cluster, it
-ensure that a job will not lose the resources it used to have before
-the job is finished. The deadlock situtation while launch multiple jobs at the
-same time is avoided by create a big placement group that contains the minimum
-required command actors for the job. Once the placement groups are created(may
-not be scheduled on a physical node yet), then we schedule command actors to
-the corresponding placement group, each actor is associated with a placement
-group which hold the resource the acotr needs. Each time a placement group successfully
-acquired the resources from the ray cluster, the actor scheduled to this placement group
-will be executed. Command actors are state machines their behavior is defined by the
-_step function, this give more flexibility to us if we want to bette handle the
-node failures.
-"""
-import json
-import logging
-import os
-import socket
-import subprocess
-import sys
-
-from contextlib import closing
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-
-import ray
-from ray.util.placement_group import PlacementGroup
-
-if TYPE_CHECKING:
-    from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
-# Hack to make code work for tests as well as running ray job.
-# For tests the `torchx.schedulers.ray.ray_common` import must be used
-# For running ray jobs `ray_common` import must be used
-try:
-    # pyre-fixme[21]: Could not find a module corresponding to import `ray_common`.
-    from ray_common import RayActor, TORCHX_RANK0_HOST  # noqa: F811
-except ModuleNotFoundError:
-    from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-
-_logger: logging.Logger = logging.getLogger(__name__)
-_logger.setLevel(logging.getLevelName(os.environ.get("LOGLEVEL", "INFO")))
-logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
-
-
-@dataclass
-class RayResult:
-    id: str
-
-
-class TaskCompleted(RayResult):
-    pass
-
-
-class CommandActorScheduled(RayResult):
-    pass
-
-
-@ray.remote
-class CommandActor:  # pragma: no cover
-    def __init__(self, cmd: List[str], env: Dict[str, str]) -> None:
-        self.cmd: List[str] = cmd
-        self.env: Dict[str, str] = env
-
-    def exec_module(
-        self, master_addr: str, master_port: int, actor_id: str
-    ) -> TaskCompleted:
-        """Execute a user script"""
-        if master_addr is None or master_port is None:
-            raise RuntimeError(
-                "Either MASTER_ADDR or MASTER_PORT are not set. This is most likely bug in torchx"
-                "Open issue at https://github.com/pytorch/torchx"
-            )
-        worker_evn = {}
-        worker_evn.update(os.environ)
-        worker_evn.update(self.env)
-        worker_evn[TORCHX_RANK0_HOST] = master_addr
-        popen = subprocess.Popen(self.cmd, env=worker_evn)
-
-        returncode = popen.wait()
-        _logger.info(f"Finished with code {returncode}")
-
-        if returncode != 0:
-            raise RuntimeError(f"exec_module failed with return code {returncode}")
-
-        return TaskCompleted(actor_id)
-
-    def schedule(self, actor_id: str) -> CommandActorScheduled:
-        """Testing if a command actor is scheduled"""
-        return CommandActorScheduled(actor_id)
-
-    def get_actor_address_and_port(self) -> Tuple[str, int]:
-        addr = ray.util.get_node_ip_address()
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(("", 0))
-            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            port = s.getsockname()[1]
-        return addr, port
-
-
-def load_actor_json(filename: str) -> List[RayActor]:
-    """Loading replicas specifications from a JSON file"""
-    with open(filename) as f:
-        actors: List[RayActor] = []
-        # Yes this is gross but it works
-        actor_dict = json.load(f)
-        actor_dict = json.loads(actor_dict)
-        for actor in actor_dict:
-            actors.append(RayActor(**actor))
-        return actors
-
-
-def create_placement_group_async(replicas: List[RayActor]) -> PlacementGroup:
-    """return a placement group reference, the corresponding placement group could be scheduled or pending"""
-    bundles = []
-    for replica in replicas:
-        bundles.append({"CPU": replica.num_cpus, "GPU": replica.num_gpus})
-
-    pg = ray.util.placement_group(bundles, strategy="SPREAD")
-    return pg
-
-
-@dataclass
-class ActorInfo:
-    """Used to store the information for restoring a failed command actor"""
-
-    pg: PlacementGroup
-    replica: RayActor
-    actor: CommandActor
-
-
-class RayDriver:
-    def __init__(self, replicas: List[RayActor]) -> None:
-        self.replicas = replicas
-        self.master_node_id: Optional[str] = None  # the actor id of the master node
-        self.rank_0_address: Optional[str] = None
-        self.rank_0_port: Optional[int] = None
-        self.max_replicas: int = len(replicas)
-        self.min_replicas: int
-        if replicas[0].min_replicas is None:
-            self.min_replicas = self.max_replicas
-        else:
-            self.min_replicas = replicas[0].min_replicas  # pyre-ignore[8]
-
-        self.placement_groups: List[
-            PlacementGroup
-        ] = []  # all the placement groups, shall never change
-        self.actor_info_of_id: Dict[
-            str, ActorInfo
-        ] = {}  # store the info used to recover an actor
-        self.active_tasks: List["ray.ObjectRef"] = []  # list of active tasks
-
-        self.terminating: bool = False  # if the job has finished and being terminated
-        self.command_actors_count: int = 0  # number of created command actors
-
-    def init_placement_groups(self) -> None:
-        """Initialize all placement groups needed for this job"""
-        # find the actor specifications of a given placement group
-        replica_ix_of_pg: List[int] = [0] + list(
-            range(
-                self.min_replicas,
-                self.max_replicas + 1,
-            )
-        )
-        # create all the placement groups
-        initial_group = create_placement_group_async(
-            self.replicas[replica_ix_of_pg[0] : replica_ix_of_pg[1]]
-        )
-        _logger.info("Waiting for minimum placement group to start.")
-        ready = initial_group.wait(100)
-        if not ready:  # pragma: no cover
-            raise TimeoutError(
-                "Placement group creation timed out. Make sure "
-                "your cluster either has enough resources or use "
-                "an autoscaling cluster. Current resources "
-                "available: {}, resources requested by the "
-                "placement group: {}".format(
-                    ray.available_resources(), initial_group.bundle_specs
-                )
-            )
-        self.placement_groups.append(initial_group)
-        for i in range(1, len(replica_ix_of_pg) - 1):
-            self.placement_groups.append(
-                create_placement_group_async(
-                    self.replicas[replica_ix_of_pg[i] : replica_ix_of_pg[i + 1]]
-                )
-            )
-
-    def pop_actor_info(self, actor_id: str) -> ActorInfo:
-        """Remove and return the info of a dead command actor"""
-        return self.actor_info_of_id.pop(actor_id)
-
-    def create_and_schedule_actor(self, pg: PlacementGroup, replica: RayActor) -> None:
-        """create an command actor in the given placement group"""
-        # create the command actor
-        actor = CommandActor.options(  # pyre-ignore[16]
-            placement_group=pg,
-            num_cpus=replica.num_cpus,
-            num_gpus=replica.num_gpus,
-        ).remote(replica.command, replica.env)
-
-        # get the actor id of the created actor
-        actor_id = actor._actor_id.hex()
-        # launch a task to check if the actor is scheduled
-        self.active_tasks.append(actor.schedule.remote(actor_id))
-        # save the actor info for recovering from node failures
-        self.actor_info_of_id[actor_id] = ActorInfo(
-            actor=actor,
-            pg=pg,
-            replica=replica,
-        )
-
-    def place_command_actors(self) -> None:
-        """Creating all command actors in all placement groups"""
-        # find the placement group index for a replica(actor's specification)
-        pg_ix_of_replica: List[int] = [
-            max(0, i - self.min_replicas + 1) for i in range(len(self.replicas))
-        ]
-        # create the actors
-        for i in range(len(self.replicas)):
-            pg_ix = pg_ix_of_replica[i]
-            pg = self.placement_groups[pg_ix]  # find the created placement group
-            replica = self.replicas[i]
-            self.create_and_schedule_actor(pg, replica)
-
-    def _step(self) -> bool:
-        """Handling command actor's return"""
-        result: RayResult  # execution result
-        _logger.info(f"running ray.wait on {self.active_tasks}")
-        # ray.wait is partial waiting
-        completed_tasks, self.active_tasks = ray.wait(self.active_tasks)
-        # If a failure occurs the ObjectRef will be marked as completed.
-        # Calling ray.get will expose the failure as a RayActorError.
-        for object_ref in completed_tasks:
-            result = ray.get(object_ref)
-            if isinstance(result, CommandActorScheduled):
-                if not self.terminating:
-                    actor = self.actor_info_of_id[result.id].actor
-                    if self.master_node_id is None:
-                        # make this actor be the master node
-                        self.master_node_id = result.id
-                        self.rank_0_address, self.rank_0_port = ray.get(
-                            actor.get_actor_address_and_port.remote()  # pyre-ignore
-                        )
-                        self.active_tasks.append(
-                            actor.exec_module.remote(  # pyre-ignore
-                                "localhost", 0, result.id
-                            )
-                        )
-                    else:
-                        self.active_tasks.append(
-                            actor.exec_module.remote(
-                                self.rank_0_address, self.rank_0_port, result.id
-                            )
-                        )
-                    self.command_actors_count += 1
-            elif isinstance(result, TaskCompleted):
-                self.terminating = (
-                    True  # terminating the job, wait for all actors to finish
-                )
-                self.command_actors_count -= 1  # 1 completed command actor
-                self.pop_actor_info(result.id)
-                if (
-                    self.command_actors_count == 0
-                ):  # all the command actors have finished
-                    return True  # is terminal
-            else:
-                raise RuntimeError(
-                    f"Ray actor returns unknown type {type(result)}"
-                    "This is most likely bug in torchx"
-                    "Open issue at https://github.com/pytorch/torchx"
-                )
-        return False
-
-    def run(self) -> None:
-        """This is the main loop the ray driver, it executes the user script on the scheduled nodes,
-        and restart the failed nodes(node failures). The loop ends when all the actors that joining
-        the job exits."""
-        self.terminating = False
-        self.command_actors_count = 0
-        # Await return result of remote ray function and initialize new command actors
-        while len(self.active_tasks) > 0:
-            terminal = self._step()
-            if terminal:
-                break
-
-
-def main() -> None:  # pragma: no cover
-    actors: List[RayActor] = load_actor_json("actors.json")
-    driver = RayDriver(actors)
-    ray.init(address="auto", namespace="torchx-ray")
-    driver.init_placement_groups()
-    _logger.info("Successfully created placement groups")
-    driver.place_command_actors()
-    _logger.info("Successfully placed command actors")
-    _logger.info("Entering main loop, start executing the script on worker nodes")
-    driver.run()
-
-
-if __name__ == "__main__":
-    main()