torchx-nightly 2025.10.16__py3-none-any.whl → 2025.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

torchx/_version.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ BASE_VERSION = "0.8.0dev0"
torchx/cli/cmd_delete.py ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import argparse
11
+ import logging
12
+
13
+ from torchx.cli.cmd_base import SubCommand
14
+ from torchx.runner import get_runner
15
+
16
+ logger: logging.Logger = logging.getLogger(__name__)
17
+
18
+
19
+ class CmdDelete(SubCommand):
20
+ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
21
+ subparser.add_argument(
22
+ "app_handle",
23
+ type=str,
24
+ help="torchx app handle (e.g. local://session-name/app-id)",
25
+ )
26
+
27
+ def run(self, args: argparse.Namespace) -> None:
28
+ app_handle = args.app_handle
29
+ runner = get_runner()
30
+ runner.delete(app_handle)
torchx/cli/main.py CHANGED
@@ -16,6 +16,7 @@ import torchx
16
16
  from torchx.cli.cmd_base import SubCommand
17
17
  from torchx.cli.cmd_cancel import CmdCancel
18
18
  from torchx.cli.cmd_configure import CmdConfigure
19
+ from torchx.cli.cmd_delete import CmdDelete
19
20
  from torchx.cli.cmd_describe import CmdDescribe
20
21
  from torchx.cli.cmd_list import CmdList
21
22
  from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
37
38
  "builtins": CmdBuiltins(),
38
39
  "cancel": CmdCancel(),
39
40
  "configure": CmdConfigure(),
41
+ "delete": CmdDelete(),
40
42
  "describe": CmdDescribe(),
41
43
  "list": CmdList(),
42
44
  "log": CmdLog(),
torchx/runner/api.py CHANGED
@@ -420,52 +420,44 @@ class Runner:
420
420
  scheduler,
421
421
  runcfg=json.dumps(cfg) if cfg else None,
422
422
  workspace=str(workspace),
423
- ):
423
+ ) as ctx:
424
424
  sched = self._scheduler(scheduler)
425
425
  resolved_cfg = sched.run_opts().resolve(cfg)
426
426
 
427
427
  sched._pre_build_validate(app, scheduler, resolved_cfg)
428
428
 
429
429
  if isinstance(sched, WorkspaceMixin):
430
- for i, role in enumerate(app.roles):
431
- role_workspace = role.workspace
432
-
433
- if i == 0 and workspace:
434
- # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
435
- # later, torchx added support for the workspace attr in Role
436
- # for BC, give precedence to the workspace argument over the workspace attr for role[0]
437
- if role_workspace:
438
- logger.info(
439
- f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
440
- " To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
441
- )
442
- role_workspace = workspace
443
-
444
- if role_workspace:
445
- old_img = role.image
430
+ if workspace:
431
+ # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
432
+ # later, torchx added support for the workspace attr in Role
433
+ # for BC, give precedence to the workspace argument over the workspace attr for role[0]
434
+ if app.roles[0].workspace:
446
435
  logger.info(
447
- f"Checking for changes in workspace `{role_workspace}` for role[{i}]={role.name}..."
448
- )
449
- # TODO kiuk@ once we deprecate the `workspace` argument in runner APIs we can simplify the signature of
450
- # build_workspace_and_update_role2() to just taking the role and resolved_cfg
451
- sched.build_workspace_and_update_role2(
452
- role, role_workspace, resolved_cfg
436
+ "Overriding role[%d] (%s) workspace to `%s`"
437
+ "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
438
+ 0,
439
+ role.name,
440
+ str(app.roles[0].workspace),
453
441
  )
442
+ app.roles[0].workspace = (
443
+ Workspace.from_str(workspace)
444
+ if isinstance(workspace, str)
445
+ else workspace
446
+ )
454
447
 
455
- if old_img != role.image:
456
- logger.info(
457
- f"Built new image `{role.image}` based on original image `{old_img}`"
458
- f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
459
- )
460
- else:
461
- logger.info(
462
- f"Reusing original image `{old_img}` for role[{i}]={role.name}."
463
- " Either a patch was built or no changes to workspace was detected."
464
- )
448
+ sched.build_workspaces(app.roles, resolved_cfg)
465
449
 
466
450
  sched._validate(app, scheduler, resolved_cfg)
467
451
  dryrun_info = sched.submit_dryrun(app, resolved_cfg)
468
452
  dryrun_info._scheduler = scheduler
453
+
454
+ event = ctx._torchx_event
455
+ event.scheduler = scheduler
456
+ event.runcfg = json.dumps(cfg) if cfg else None
457
+ event.app_id = app.name
458
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
459
+ event.app_metadata = app.metadata
460
+
469
461
  return dryrun_info
470
462
 
471
463
  def scheduler_run_opts(self, scheduler: str) -> runopts:
@@ -595,6 +587,16 @@ class Runner:
595
587
  if status is not None and not status.is_terminal():
596
588
  scheduler.cancel(app_id)
597
589
 
590
+ def delete(self, app_handle: AppHandle) -> None:
591
+ """
592
+ Deletes the application from the scheduler.
593
+ """
594
+ scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
595
+ with log_event("delete", scheduler_backend, app_id):
596
+ status = self.status(app_handle)
597
+ if status is not None:
598
+ scheduler.delete(app_id)
599
+
598
600
  def stop(self, app_handle: AppHandle) -> None:
599
601
  """
600
602
  See method ``cancel``.
torchx/schedulers/api.py CHANGED
@@ -11,10 +11,11 @@ import re
11
11
  from dataclasses import dataclass, field
12
12
  from datetime import datetime
13
13
  from enum import Enum
14
- from typing import Generic, Iterable, List, Optional, TypeVar, Union
14
+ from typing import Generic, Iterable, List, Optional, TypeVar
15
15
 
16
16
  from torchx.specs import (
17
17
  AppDef,
18
+ AppDryRunInfo,
18
19
  AppState,
19
20
  NONE,
20
21
  NULL_RESOURCE,
@@ -95,11 +96,9 @@ class ListAppResponse:
95
96
 
96
97
 
97
98
  T = TypeVar("T")
98
- A = TypeVar("A")
99
- D = TypeVar("D")
100
99
 
101
100
 
102
- class Scheduler(abc.ABC, Generic[T, A, D]):
101
+ class Scheduler(abc.ABC, Generic[T]):
103
102
  """
104
103
  An interface abstracting functionalities of a scheduler.
105
104
  Implementers need only implement those methods annotated with
@@ -129,9 +128,9 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
129
128
 
130
129
  def submit(
131
130
  self,
132
- app: A,
131
+ app: AppDef,
133
132
  cfg: T,
134
- workspace: Optional[Union[Workspace, str]] = None,
133
+ workspace: str | Workspace | None = None,
135
134
  ) -> str:
136
135
  """
137
136
  Submits the application to be run by the scheduler.
@@ -145,14 +144,19 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
145
144
  resolved_cfg = self.run_opts().resolve(cfg)
146
145
  if workspace:
147
146
  assert isinstance(self, WorkspaceMixin)
148
- self.build_workspace_and_update_role2(app.roles[0], workspace, resolved_cfg)
147
+
148
+ if isinstance(workspace, str):
149
+ workspace = Workspace.from_str(workspace)
150
+
151
+ app.roles[0].workspace = workspace
152
+ self.build_workspaces(app.roles, resolved_cfg)
149
153
 
150
154
  # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
151
155
  dryrun_info = self.submit_dryrun(app, resolved_cfg)
152
156
  return self.schedule(dryrun_info)
153
157
 
154
158
  @abc.abstractmethod
155
- def schedule(self, dryrun_info: D) -> str:
159
+ def schedule(self, dryrun_info: AppDryRunInfo) -> str:
156
160
  """
157
161
  Same as ``submit`` except that it takes an ``AppDryRunInfo``.
158
162
  Implementers are encouraged to implement this method rather than
@@ -168,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
168
172
 
169
173
  raise NotImplementedError()
170
174
 
171
- def submit_dryrun(self, app: A, cfg: T) -> D:
175
+ def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
172
176
  """
173
177
  Rather than submitting the request to run the app, returns the
174
178
  request object that would have been submitted to the underlying
@@ -182,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
182
186
  # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
183
187
  dryrun_info = self._submit_dryrun(app, resolved_cfg)
184
188
 
185
- if isinstance(app, AppDef):
186
- for role in app.roles:
187
- dryrun_info = role.pre_proc(self.backend, dryrun_info)
189
+ for role in app.roles:
190
+ dryrun_info = role.pre_proc(self.backend, dryrun_info)
191
+
188
192
  dryrun_info._app = app
189
193
  dryrun_info._cfg = resolved_cfg
190
194
  return dryrun_info
191
195
 
192
196
  @abc.abstractmethod
193
- def _submit_dryrun(self, app: A, cfg: T) -> D:
197
+ def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
194
198
  raise NotImplementedError()
195
199
 
196
200
  def run_opts(self) -> runopts:
@@ -259,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
259
263
  # do nothing if the app does not exist
260
264
  return
261
265
 
266
+ def delete(self, app_id: str) -> None:
267
+ """
268
+ Deletes the job information for the specified ``app_id`` from the
269
+ scheduler's data-plane. Basically "deep-purging" the job from the
270
+ scheduler's data-plane. Calling this API on a "live" job (e.g in a
271
+ non-terminal status such as PENDING or RUNNING) cancels the job.
272
+
273
+ Note that this API is only relevant for schedulers for which its
274
+ data-plane persistently stores the "JobDefinition" (which is often
275
+ versioned). AWS Batch and Kubernetes are examples of such schedulers.
276
+ On these schedulers, a finished job may fall out of the data-plane
277
+ (e.g. really old finished jobs get deleted) but the JobDefinition is
278
+ typically permanently stored. In this case, calling
279
+ :py:meth:`~cancel` would not delete the job definition.
280
+
281
+ In schedulers with no such feature (e.g. SLURM)
282
+ :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
283
+ default implementation. Hence implementors of such schedulers need not
284
+ override this method.
285
+
286
+ .. warning::
287
+ Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
288
+ the scheduler's data-plane does nothing. The user is responsible for
289
+ manually tracking down and cleaning up any dangling resources related
290
+ to the job.
291
+ """
292
+ if self.exists(app_id):
293
+ self._delete_existing(app_id)
294
+
295
+ def _delete_existing(self, app_id: str) -> None:
296
+ """
297
+ Deletes the job information for the specified ``app_id`` from the
298
+ scheduler's data-plane. This method will only be called on an
299
+ application that exists.
300
+
301
+ The default implementation calls :py:meth:`~_cancel_existing` which is
302
+ appropriate for schedulers without persistent job definitions.
303
+ """
304
+ self._cancel_existing(app_id)
305
+
262
306
  def log_iter(
263
307
  self,
264
308
  app_id: str,
@@ -349,15 +393,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
349
393
  """
350
394
  pass
351
395
 
352
- def _validate(self, app: A, scheduler: str, cfg: T) -> None:
396
+ def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
353
397
  """
354
398
  Validates after workspace build whether application is consistent with the scheduler.
355
399
 
356
400
  Raises error if application is not compatible with scheduler
357
401
  """
358
- if not isinstance(app, AppDef):
359
- return
360
-
361
402
  for role in app.roles:
362
403
  if role.resource == NULL_RESOURCE:
363
404
  raise ValueError(
torchx/schedulers/aws_batch_scheduler.py CHANGED
@@ -381,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
381
381
 
382
382
 
383
383
  @_thread_local_cache
384
- def _local_session() -> "boto3.session.Session":
384
+ def _local_session() -> "boto3.session.Session": # noqa: F821
385
385
  import boto3.session
386
386
 
387
387
  return boto3.session.Session()
@@ -399,9 +399,7 @@ class AWSBatchOpts(TypedDict, total=False):
399
399
  ulimits: Optional[list[str]]
400
400
 
401
401
 
402
- class AWSBatchScheduler(
403
- DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
404
- ):
402
+ class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
405
403
  """
406
404
  AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
407
405
 
torchx/schedulers/aws_sagemaker_scheduler.py CHANGED
@@ -157,7 +157,7 @@ def _merge_ordered(
157
157
 
158
158
  class AWSSageMakerScheduler(
159
159
  DockerWorkspaceMixin,
160
- Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
160
+ Scheduler[AWSSageMakerOpts],
161
161
  ):
162
162
  """
163
163
  AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
torchx/schedulers/docker_scheduler.py CHANGED
@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
129
129
  privileged: bool
130
130
 
131
131
 
132
- class DockerScheduler(
133
- DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
134
- ):
132
+ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
135
133
  """
136
134
  DockerScheduler is a TorchX scheduling interface to Docker.
137
135
 
torchx/schedulers/kubernetes_mcad_scheduler.py CHANGED
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
796
796
  network: Optional[str]
797
797
 
798
798
 
799
- class KubernetesMCADScheduler(
800
- DockerWorkspaceMixin,
801
- Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
802
- ):
799
+ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
803
800
  """
804
801
  KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
805
802