torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/runner/api.py CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -25,6 +24,7 @@ from typing import (
     Type,
     TYPE_CHECKING,
     TypeVar,
+    Union,
 )

 from torchx.runner.events import log_event
@@ -42,6 +42,7 @@ from torchx.specs import (
     parse_app_handle,
     runopts,
     UnknownAppException,
+    Workspace,
 )
 from torchx.specs.finder import get_component
 from torchx.tracker.api import (
@@ -53,7 +54,7 @@ from torchx.tracker.api import (
 from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID

 from torchx.util.types import none_throws
-from torchx.workspace.api import PkgInfo, WorkspaceBuilder, WorkspaceMixin
+from torchx.workspace import WorkspaceMixin

 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -129,9 +130,9 @@ class Runner:
     def _get_scheduler_params_from_env(self) -> Dict[str, str]:
         scheduler_params = {}
         for key, value in os.environ.items():
-            lower_case_key = key.lower()
-            if lower_case_key.startswith("torchx_"):
-                scheduler_params[lower_case_key.strip("torchx_")] = value
+            key = key.lower()
+            if key.startswith("torchx_"):
+                scheduler_params[key.removeprefix("torchx_")] = value
         return scheduler_params

     def __enter__(self) -> "Self":
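
Worth noting on the hunk above: this is a behavior fix, not just a rename. str.strip("torchx_") treats its argument as a set of characters to trim from both ends of the string, while str.removeprefix (Python 3.9+) removes only the literal prefix. A quick illustration (plain Python, not part of the diff):

    >>> "torchx_tracker".strip("torchx_")        # trims chars {t,o,r,c,h,x,_} from both ends
    'acke'
    >>> "torchx_tracker".removeprefix("torchx_")  # removes the literal prefix only
    'tracker'
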
@@ -164,25 +165,13 @@ class Runner:
         for scheduler in self._scheduler_instances.values():
             scheduler.close()

-    def build_standalone_workspace(
-        self,
-        workspace_builder: WorkspaceBuilder[S, T],
-        sync: bool = True,
-    ) -> PkgInfo[S]:
-        """
-        Build a standalone workspace for the given role.
-        This method is used to build a workspace for a role independent of the scheduler and
-        also enables asynchronous workspace building using the Role overrides.
-        """
-        return workspace_builder.build_workspace(sync)
-
     def run_component(
         self,
         component: str,
-        component_args: List[str],
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -217,7 +206,7 @@
             ComponentNotFoundException: if the ``component_path`` is failed to resolve.
         """

-        with log_event("run_component", workspace=workspace) as ctx:
+        with log_event("run_component") as ctx:
             dryrun_info = self.dryrun_component(
                 component,
                 component_args,
@@ -228,7 +217,8 @@
             )
             handle = self.schedule(dryrun_info)
             app = none_throws(dryrun_info._app)
-            ctx._torchx_event.workspace = workspace
+
+            ctx._torchx_event.workspace = str(workspace)
             ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
             ctx._torchx_event.app_image = app.roles[0].image
             ctx._torchx_event.app_id = parse_app_handle(handle)[2]
@@ -238,10 +228,10 @@
     def dryrun_component(
         self,
         component: str,
-        component_args: List[str],
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -249,10 +239,13 @@
         component, but just returns what "would" have run.
         """
         component_def = get_component(component)
+        args_from_cli = component_args if isinstance(component_args, list) else []
+        args_from_json = component_args if isinstance(component_args, dict) else {}
         app = materialize_appdef(
             component_def.fn,
-            component_args,
+            args_from_cli,
             self._component_defaults.get(component, None),
+            args_from_json,
         )
         return self.dryrun(
             app,
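
As the hunk above shows, component_args may now be either CLI-style positional args (a list) or structured kwargs (a dict); dryrun_component routes each form to the matching parameter of materialize_appdef. A hedged usage sketch (the component and argument values are illustrative, not from the diff):

    from torchx.runner import get_runner

    runner = get_runner()
    # CLI-style args, as before:
    runner.dryrun_component("utils.echo", ["--msg", "hello"], scheduler="local_cwd")
    # structured (JSON-style) kwargs are now also accepted:
    runner.dryrun_component("utils.echo", {"msg": "hello"}, scheduler="local_cwd")
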
@@ -267,7 +260,7 @@
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -280,9 +273,7 @@
             An application handle that is used to call other action APIs on the app.
         """

-        with log_event(
-            api="run", runcfg=json.dumps(cfg) if cfg else None, workspace=workspace
-        ) as ctx:
+        with log_event(api="run") as ctx:
             dryrun_info = self.dryrun(
                 app,
                 scheduler,
@@ -291,10 +282,15 @@
                 parent_run_id=parent_run_id,
             )
             handle = self.schedule(dryrun_info)
-            ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
-            ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image
-            ctx._torchx_event.app_id = parse_app_handle(handle)[2]
-            ctx._torchx_event.app_metadata = app.metadata
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.workspace = str(workspace)
+            event.app_id = parse_app_handle(handle)[2]
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return handle

     def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
@@ -328,21 +324,22 @@

         """
         scheduler = none_throws(dryrun_info._scheduler)
-        app_image = none_throws(dryrun_info._app).roles[0].image
         cfg = dryrun_info._cfg
-        with log_event(
-            "schedule",
-            scheduler,
-            app_image=app_image,
-            runcfg=json.dumps(cfg) if cfg else None,
-        ) as ctx:
+        with log_event("schedule") as ctx:
             sched = self._scheduler(scheduler)
             app_id = sched.schedule(dryrun_info)
             app_handle = make_app_handle(scheduler, self._name, app_id)
+
             app = none_throws(dryrun_info._app)
             self._apps[app_handle] = app
-            _, _, app_id = parse_app_handle(app_handle)
-            ctx._torchx_event.app_id = app_id
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app_id
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return app_handle

     def name(self) -> str:
@@ -353,7 +350,7 @@
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -422,52 +419,45 @@
             "dryrun",
             scheduler,
             runcfg=json.dumps(cfg) if cfg else None,
-            workspace=workspace,
-        ):
+            workspace=str(workspace),
+        ) as ctx:
             sched = self._scheduler(scheduler)
             resolved_cfg = sched.run_opts().resolve(cfg)

-            # early validation before build workspace
-            with log_event(
-                "pre_build_validate",
-                scheduler,
-            ):
-                sched._pre_build_validate(app, scheduler, resolved_cfg)
-
-            if workspace and isinstance(sched, WorkspaceMixin):
-                role = app.roles[0]
-                old_img = role.image
-
-                logger.info(f"Checking for changes in workspace `{workspace}`...")
-                logger.info(
-                    'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
-                )
-                with log_event(
-                    "build_workspace_and_update_role",
-                    scheduler,
-                ) as ctx:
-                    sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
-                    ctx._torchx_event.app_image = role.image
-                    ctx._torchx_event.workspace = workspace
-
-                if old_img != role.image:
-                    logger.info(
-                        f"Built new image `{role.image}` based on original image `{old_img}`"
-                        f" and changes in workspace `{workspace}` for role[0]={role.name}."
-                    )
-                else:
-                    logger.info(
-                        f"Reusing original image `{old_img}` for role[0]={role.name}."
-                        " Either a patch was built or no changes to workspace was detected."
+            sched._pre_build_validate(app, scheduler, resolved_cfg)
+
+            if isinstance(sched, WorkspaceMixin):
+                if workspace:
+                    # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+                    # later, torchx added support for the workspace attr in Role
+                    # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+                    if app.roles[0].workspace:
+                        logger.info(
+                            "Overriding role[%d] (%s) workspace to `%s`"
+                            "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+                            0,
+                            role.name,
+                            str(app.roles[0].workspace),
+                        )
+                    app.roles[0].workspace = (
+                        Workspace.from_str(workspace)
+                        if isinstance(workspace, str)
+                        else workspace
                     )

-            with log_event(
-                "validate",
-                scheduler,
-            ):
-                sched._validate(app, scheduler, resolved_cfg)
+                sched.build_workspaces(app.roles, resolved_cfg)
+
+            sched._validate(app, scheduler, resolved_cfg)
             dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app.name
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return dryrun_info

     def scheduler_run_opts(self, scheduler: str) -> runopts:
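
The rewritten dryrun above changes where workspace builds happen: instead of building role[0] inline, it normalizes the workspace argument onto app.roles[0].workspace (the argument wins over a pre-set attr, for backwards compatibility) and hands all roles to sched.build_workspaces. A sketch of the precedence rule, assuming a runner and an AppDef named app are already in hand:

    from torchx.runner import get_runner
    from torchx.specs import Workspace

    runner = get_runner()
    app.roles[0].workspace = Workspace.from_str("/repo/src")  # app: an existing AppDef
    # An explicit argument overrides the role's own workspace attr (and logs the override):
    runner.dryrun(app, "kubernetes", workspace="/repo/experimental")
    # Pass workspace=None (or --workspace='' on the CLI) to honor the role's attr instead.
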
@@ -486,6 +476,27 @@
         """
         return self._scheduler(scheduler).run_opts()

+    def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
+        """
+        Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx.runner import get_runner
+
+            runner = get_runner()
+            cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
+            assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
+        """
+
+        opts = self._scheduler(scheduler).run_opts()
+        cfg = {}
+        for cfg_str in cfg_literal:
+            cfg.update(opts.cfg_from_str(cfg_str))
+        return cfg
+
     def scheduler_backends(self) -> List[str]:
         """
         Returns a list of all supported scheduler backends.
@@ -576,6 +587,16 @@
         if status is not None and not status.is_terminal():
             scheduler.cancel(app_id)

+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
+
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.
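
A minimal usage sketch of the new Runner.delete API, assuming app_handle was returned by an earlier run or run_component call:

    from torchx.runner import get_runner

    with get_runner() as runner:
        # cancels the job if it is still live, then purges its info
        # from the scheduler's data-plane (see Scheduler.delete below)
        runner.delete(app_handle)
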
torchx/runner/config.py CHANGED
@@ -73,7 +73,7 @@ CLI Usage

    #. In addition, it is possible to specify a different config other than .torchxconfig to
       load at runtime. Requirements are that the config path is specified by enviornment
-      variable TORCHX_CONFIG. It also disables hierarchy loading configs from multiple
+      variable TORCHXCONFIG. It also disables hierarchy loading configs from multiple
       directories as the cases otherwise.

    #. User level .torchxconfig
@@ -494,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:

     config = os.getenv(ENV_TORCHXCONFIG)
     if config is not None:
+        if not config:
+            return []
         configfile = Path(config)
         if not configfile.is_file():
             raise FileNotFoundError(
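
The two lines added to find_configs above make an explicitly empty config override short-circuit instead of failing on Path(""). Assuming ENV_TORCHXCONFIG names the TORCHXCONFIG environment variable (as the docs fix above suggests), the behavior is roughly:

    import os
    from torchx.runner.config import find_configs

    os.environ["TORCHXCONFIG"] = ""
    assert find_configs() == []    # new: an empty value disables config loading

    os.environ["TORCHXCONFIG"] = "/no/such/file"
    find_configs()                 # unchanged: a non-empty, missing path raises FileNotFoundError
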
torchx/runner/events/__init__.py CHANGED
@@ -33,8 +33,9 @@ from torchx.util.session import get_session_id_or_create_new

 from .api import SourceType, TorchxEvent  # noqa F401

-# pyre-fixme[9]: _events_logger is a global variable
-_events_logger: logging.Logger = None
+_events_logger: Optional[logging.Logger] = None
+
+log: logging.Logger = logging.getLogger(__name__)


 def _get_or_create_logger(destination: str = "null") -> logging.Logger:
@@ -51,19 +52,28 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
        a new logger if None provided.
    """
    global _events_logger
+
    if _events_logger:
        return _events_logger
-    logging_handler = get_logging_handler(destination)
-    logging_handler.setLevel(logging.DEBUG)
-    _events_logger = logging.getLogger(f"torchx-events-{destination}")
-    # Do not propagate message to the root logger
-    _events_logger.propagate = False
-    _events_logger.addHandler(logging_handler)
-    return _events_logger
+    else:
+        logging_handler = get_logging_handler(destination)
+        logging_handler.setLevel(logging.DEBUG)
+        _events_logger = logging.getLogger(f"torchx-events-{destination}")
+        # Do not propagate message to the root logger
+        _events_logger.propagate = False
+        _events_logger.addHandler(logging_handler)
+
+    assert _events_logger  # make type-checker happy
+    return _events_logger


 def record(event: TorchxEvent, destination: str = "null") -> None:
-    _get_or_create_logger(destination).info(event.serialize())
+    try:
+        serialized_event = event.serialize()
+    except Exception:
+        log.exception("failed to serialize event, will not record event")
+    else:
+        _get_or_create_logger(destination).info(serialized_event)


 class log_event:
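
The record change above makes event emission failure-tolerant: serialization is attempted first, and a failure is logged through the new module-level log logger rather than propagating to the caller. Roughly (the event value is hypothetical):

    from torchx.runner.events import record

    record(my_event)  # my_event: a TorchxEvent; if my_event.serialize() raises,
                      # the exception is logged and the event is silently dropped
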
torchx/runner/events/api.py CHANGED
@@ -29,7 +29,7 @@ class TorchxEvent:
        scheduler: Scheduler that is used to execute request
        api: Api name
        app_id: Unique id that is set by the underlying scheduler
-       image: Image/container bundle that is used to execute request.
+       app_image: Image/container bundle that is used to execute request.
        app_metadata: metadata to the app (treatment of metadata is scheduler dependent)
        runcfg: Run config that was used to schedule app.
        source: Type of source the event is generated.
torchx/schedulers/__init__.py CHANGED
@@ -21,8 +21,6 @@ DEFAULT_SCHEDULER_MODULES: Mapping[str, str] = {
     "kubernetes_mcad": "torchx.schedulers.kubernetes_mcad_scheduler",
     "aws_batch": "torchx.schedulers.aws_batch_scheduler",
     "aws_sagemaker": "torchx.schedulers.aws_sagemaker_scheduler",
-    "gcp_batch": "torchx.schedulers.gcp_batch_scheduler",
-    "ray": "torchx.schedulers.ray_scheduler",
     "lsf": "torchx.schedulers.lsf_scheduler",
 }

@@ -51,15 +49,14 @@ def get_scheduler_factories(
     The first scheduler in the dictionary is used as the default scheduler.
     """

-    default_schedulers: dict[str, SchedulerFactory] = {}
-    for scheduler, path in DEFAULT_SCHEDULER_MODULES.items():
-        default_schedulers[scheduler] = _defer_load_scheduler(path)
+    if skip_defaults:
+        default_schedulers = {}
+    else:
+        default_schedulers: dict[str, SchedulerFactory] = {}
+        for scheduler, path in DEFAULT_SCHEDULER_MODULES.items():
+            default_schedulers[scheduler] = _defer_load_scheduler(path)

-    return load_group(
-        group,
-        default=default_schedulers,
-        skip_defaults=skip_defaults,
-    )
+    return load_group(group, default=default_schedulers)


 def get_default_scheduler_name() -> str:
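
With the change above, skip_defaults=True now suppresses the built-in scheduler table up front rather than passing a flag down to load_group. A hedged sketch of the observable difference (parameter names taken from the hunk; defaults assumed):

    from torchx.schedulers import get_scheduler_factories

    # built-ins (local_*, slurm, kubernetes, aws_batch, ...) plus entry-point schedulers
    all_factories = get_scheduler_factories()

    # only schedulers registered via the entry-point group are returned
    custom_only = get_scheduler_factories(skip_defaults=True)
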
torchx/schedulers/api.py CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -16,14 +15,16 @@ from typing import Generic, Iterable, List, Optional, TypeVar

 from torchx.specs import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     NONE,
     NULL_RESOURCE,
     Role,
     RoleStatus,
     runopts,
+    Workspace,
 )
-from torchx.workspace.api import WorkspaceMixin
+from torchx.workspace import WorkspaceMixin


 DAYS_IN_2_WEEKS = 14
@@ -95,11 +96,9 @@ class ListAppResponse:


 T = TypeVar("T")
-A = TypeVar("A")
-D = TypeVar("D")


-class Scheduler(abc.ABC, Generic[T, A, D]):
+class Scheduler(abc.ABC, Generic[T]):
     """
     An interface abstracting functionalities of a scheduler.
     Implementers need only implement those methods annotated with
@@ -129,9 +128,9 @@ class Scheduler(abc.ABC, Generic[T, A, D]):

     def submit(
         self,
-        app: A,
+        app: AppDef,
         cfg: T,
-        workspace: Optional[str] = None,
+        workspace: str | Workspace | None = None,
     ) -> str:
         """
         Submits the application to be run by the scheduler.
@@ -144,16 +143,20 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: Generic cfg type passed to resolve
         resolved_cfg = self.run_opts().resolve(cfg)
         if workspace:
-            sched = self
-            assert isinstance(sched, WorkspaceMixin)
-            role = app.roles[0]
-            sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
+            assert isinstance(self, WorkspaceMixin)
+
+            if isinstance(workspace, str):
+                workspace = Workspace.from_str(workspace)
+
+            app.roles[0].workspace = workspace
+            self.build_workspaces(app.roles, resolved_cfg)
+
         # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self.submit_dryrun(app, resolved_cfg)
         return self.schedule(dryrun_info)

     @abc.abstractmethod
-    def schedule(self, dryrun_info: D) -> str:
+    def schedule(self, dryrun_info: AppDryRunInfo) -> str:
         """
         Same as ``submit`` except that it takes an ``AppDryRunInfo``.
         Implementers are encouraged to implement this method rather than
@@ -169,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):

         raise NotImplementedError()

-    def submit_dryrun(self, app: A, cfg: T) -> D:
+    def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         """
         Rather than submitting the request to run the app, returns the
         request object that would have been submitted to the underlying
@@ -183,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self._submit_dryrun(app, resolved_cfg)

-        if isinstance(app, AppDef):
-            for role in app.roles:
-                dryrun_info = role.pre_proc(self.backend, dryrun_info)
+        for role in app.roles:
+            dryrun_info = role.pre_proc(self.backend, dryrun_info)
+
         dryrun_info._app = app
         dryrun_info._cfg = resolved_cfg
         return dryrun_info

     @abc.abstractmethod
-    def _submit_dryrun(self, app: A, cfg: T) -> D:
+    def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         raise NotImplementedError()

     def run_opts(self) -> runopts:
@@ -260,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
             # do nothing if the app does not exist
             return

+    def delete(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. Basically "deep-purging" the job from the
+        scheduler's data-plane. Calling this API on a "live" job (e.g in a
+        non-terminal status such as PENDING or RUNNING) cancels the job.
+
+        Note that this API is only relevant for schedulers for which its
+        data-plane persistently stores the "JobDefinition" (which is often
+        versioned). AWS Batch and Kubernetes are examples of such schedulers.
+        On these schedulers, a finished job may fall out of the data-plane
+        (e.g. really old finished jobs get deleted) but the JobDefinition is
+        typically permanently stored. In this case, calling
+        :py:meth:`~cancel` would not delete the job definition.
+
+        In schedulers with no such feature (e.g. SLURM)
+        :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
+        default implementation. Hence implementors of such schedulers need not
+        override this method.
+
+        .. warning::
+            Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
+            the scheduler's data-plane does nothing. The user is responsible for
+            manually tracking down and cleaning up any dangling resources related
+            to the job.
+        """
+        if self.exists(app_id):
+            self._delete_existing(app_id)
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. This method will only be called on an
+        application that exists.
+
+        The default implementation calls :py:meth:`~_cancel_existing` which is
+        appropriate for schedulers without persistent job definitions.
+        """
+        self._cancel_existing(app_id)
+
     def log_iter(
         self,
         app_id: str,
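
Schedulers whose data-plane keeps persistent job definitions are expected to override _delete_existing; the base delete guarantees it only runs for apps that still exist. A hypothetical subclass sketch (the class, cfg type, and client call are illustrative, not from this diff):

    from torchx.schedulers.api import Scheduler

    class MyBatchScheduler(Scheduler[dict]):
        def _delete_existing(self, app_id: str) -> None:
            # cancel the job first in case it is still live...
            self._cancel_existing(app_id)
            # ...then purge the persistent job definition (hypothetical client API)
            self._client.deregister_job_definition(app_id)
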
@@ -350,19 +393,17 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         """
         pass

-    def _validate(self, app: A, scheduler: str, cfg: T) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
         """
         Validates after workspace build whether application is consistent with the scheduler.

         Raises error if application is not compatible with scheduler
         """
-        if isinstance(app, AppDef):
-            for role in app.roles:
-                if role.resource == NULL_RESOURCE:
-                    raise ValueError(
-                        f"No resource for role: {role.image}."
-                        f" Did you forget to attach resource to the role"
-                    )
+        for role in app.roles:
+            if role.resource == NULL_RESOURCE:
+                raise ValueError(
+                    f"No resource for role: {role.image}. Did you forget to attach resource to the role"
+                )


 def filter_regex(regex: str, data: Iterable[str]) -> Iterable[str]: