torchx-nightly 2023.10.21__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

Files changed (110)
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +7 -6
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +79 -5
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +10 -8
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +431 -32
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +254 -22
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2023.10.21.dist-info/METADATA +0 -174
  108. torchx_nightly-2023.10.21.dist-info/RECORD +0 -118
  109. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
torchx/runner/api.py CHANGED
@@ -1,10 +1,11 @@
1
- #!/usr/bin/env python3
2
1
  # Copyright (c) Meta Platforms, Inc. and affiliates.
3
2
  # All rights reserved.
4
3
  #
5
4
  # This source code is licensed under the BSD-style license found in the
6
5
  # LICENSE file in the root directory of this source tree.
7
6
 
7
+ # pyre-strict
8
+
8
9
  import json
9
10
  import logging
10
11
  import os
@@ -12,7 +13,19 @@ import time
12
13
  import warnings
13
14
  from datetime import datetime
14
15
  from types import TracebackType
15
- from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type
16
+ from typing import (
17
+ Any,
18
+ Dict,
19
+ Iterable,
20
+ List,
21
+ Mapping,
22
+ Optional,
23
+ Tuple,
24
+ Type,
25
+ TYPE_CHECKING,
26
+ TypeVar,
27
+ Union,
28
+ )
16
29
 
17
30
  from torchx.runner.events import log_event
18
31
  from torchx.schedulers import get_scheduler_factories, SchedulerFactory
@@ -29,6 +42,7 @@ from torchx.specs import (
29
42
  parse_app_handle,
30
43
  runopts,
31
44
  UnknownAppException,
45
+ Workspace,
32
46
  )
33
47
  from torchx.specs.finder import get_component
34
48
  from torchx.tracker.api import (
@@ -37,9 +51,13 @@ from torchx.tracker.api import (
37
51
  ENV_TORCHX_TRACKERS,
38
52
  tracker_config_env_var_name,
39
53
  )
54
+ from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID
40
55
 
41
56
  from torchx.util.types import none_throws
42
- from torchx.workspace.api import WorkspaceMixin
57
+ from torchx.workspace import WorkspaceMixin
58
+
59
+ if TYPE_CHECKING:
60
+ from typing_extensions import Self
43
61
 
44
62
  from .config import get_config, get_configs
45
63
 
@@ -47,6 +65,8 @@ logger: logging.Logger = logging.getLogger(__name__)
47
65
 
48
66
 
49
67
  NONE: str = "<NONE>"
68
+ S = TypeVar("S")
69
+ T = TypeVar("T")
50
70
 
51
71
 
52
72
  def get_configured_trackers() -> Dict[str, Optional[str]]:
@@ -96,15 +116,26 @@ class Runner:
96
116
  """
97
117
  self._name: str = name
98
118
  self._scheduler_factories = scheduler_factories
99
- self._scheduler_params: Dict[str, object] = scheduler_params or {}
100
- # pyre-ignore[24]: Scheduler opts
119
+ self._scheduler_params: Dict[str, Any] = {
120
+ **(self._get_scheduler_params_from_env()),
121
+ **(scheduler_params or {}),
122
+ }
123
+ # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
101
124
  self._scheduler_instances: Dict[str, Scheduler] = {}
102
125
  self._apps: Dict[AppHandle, AppDef] = {}
103
126
 
104
127
  # component_name -> map of component_fn_param_name -> user-specified default val encoded as str
105
128
  self._component_defaults: Dict[str, Dict[str, str]] = component_defaults or {}
106
129
 
107
- def __enter__(self) -> "Runner":
130
+ def _get_scheduler_params_from_env(self) -> Dict[str, str]:
131
+ scheduler_params = {}
132
+ for key, value in os.environ.items():
133
+ key = key.lower()
134
+ if key.startswith("torchx_"):
135
+ scheduler_params[key.removeprefix("torchx_")] = value
136
+ return scheduler_params
137
+
138
+ def __enter__(self) -> "Self":
108
139
  return self
109
140
 
110
141
  def __exit__(
@@ -131,16 +162,16 @@ class Runner:
131
162
  It is ok to call this method multiple times on the same runner object.
132
163
  """
133
164
 
134
- for name, scheduler in self._scheduler_instances.items():
165
+ for scheduler in self._scheduler_instances.values():
135
166
  scheduler.close()
136
167
 
137
168
  def run_component(
138
169
  self,
139
170
  component: str,
140
- component_args: List[str],
171
+ component_args: Union[list[str], dict[str, Any]],
141
172
  scheduler: str,
142
173
  cfg: Optional[Mapping[str, CfgVal]] = None,
143
- workspace: Optional[str] = None,
174
+ workspace: Optional[Union[Workspace, str]] = None,
144
175
  parent_run_id: Optional[str] = None,
145
176
  ) -> AppHandle:
146
177
  """
@@ -175,23 +206,32 @@ class Runner:
175
206
  ComponentNotFoundException: if the ``component_path`` is failed to resolve.
176
207
  """
177
208
 
178
- dryrun_info = self.dryrun_component(
179
- component,
180
- component_args,
181
- scheduler,
182
- cfg=cfg,
183
- workspace=workspace,
184
- parent_run_id=parent_run_id,
185
- )
186
- return self.schedule(dryrun_info)
209
+ with log_event("run_component") as ctx:
210
+ dryrun_info = self.dryrun_component(
211
+ component,
212
+ component_args,
213
+ scheduler,
214
+ cfg=cfg,
215
+ workspace=workspace,
216
+ parent_run_id=parent_run_id,
217
+ )
218
+ handle = self.schedule(dryrun_info)
219
+ app = none_throws(dryrun_info._app)
220
+
221
+ ctx._torchx_event.workspace = str(workspace)
222
+ ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
223
+ ctx._torchx_event.app_image = app.roles[0].image
224
+ ctx._torchx_event.app_id = parse_app_handle(handle)[2]
225
+ ctx._torchx_event.app_metadata = app.metadata
226
+ return handle
187
227
 
188
228
  def dryrun_component(
189
229
  self,
190
230
  component: str,
191
- component_args: List[str],
231
+ component_args: Union[list[str], dict[str, Any]],
192
232
  scheduler: str,
193
233
  cfg: Optional[Mapping[str, CfgVal]] = None,
194
- workspace: Optional[str] = None,
234
+ workspace: Optional[Union[Workspace, str]] = None,
195
235
  parent_run_id: Optional[str] = None,
196
236
  ) -> AppDryRunInfo:
197
237
  """
@@ -199,10 +239,13 @@ class Runner:
199
239
  component, but just returns what "would" have run.
200
240
  """
201
241
  component_def = get_component(component)
242
+ args_from_cli = component_args if isinstance(component_args, list) else []
243
+ args_from_json = component_args if isinstance(component_args, dict) else {}
202
244
  app = materialize_appdef(
203
245
  component_def.fn,
204
- component_args,
246
+ args_from_cli,
205
247
  self._component_defaults.get(component, None),
248
+ args_from_json,
206
249
  )
207
250
  return self.dryrun(
208
251
  app,
@@ -217,7 +260,7 @@ class Runner:
217
260
  app: AppDef,
218
261
  scheduler: str,
219
262
  cfg: Optional[Mapping[str, CfgVal]] = None,
220
- workspace: Optional[str] = None,
263
+ workspace: Optional[Union[Workspace, str]] = None,
221
264
  parent_run_id: Optional[str] = None,
222
265
  ) -> AppHandle:
223
266
  """
@@ -230,10 +273,25 @@ class Runner:
230
273
  An application handle that is used to call other action APIs on the app.
231
274
  """
232
275
 
233
- dryrun_info = self.dryrun(
234
- app, scheduler, cfg=cfg, workspace=workspace, parent_run_id=parent_run_id
235
- )
236
- return self.schedule(dryrun_info)
276
+ with log_event(api="run") as ctx:
277
+ dryrun_info = self.dryrun(
278
+ app,
279
+ scheduler,
280
+ cfg=cfg,
281
+ workspace=workspace,
282
+ parent_run_id=parent_run_id,
283
+ )
284
+ handle = self.schedule(dryrun_info)
285
+
286
+ event = ctx._torchx_event
287
+ event.scheduler = scheduler
288
+ event.runcfg = json.dumps(cfg) if cfg else None
289
+ event.workspace = str(workspace)
290
+ event.app_id = parse_app_handle(handle)[2]
291
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
292
+ event.app_metadata = app.metadata
293
+
294
+ return handle
237
295
 
238
296
  def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
239
297
  """
@@ -266,21 +324,22 @@ class Runner:
266
324
 
267
325
  """
268
326
  scheduler = none_throws(dryrun_info._scheduler)
269
- app_image = none_throws(dryrun_info._app).roles[0].image
270
327
  cfg = dryrun_info._cfg
271
- with log_event(
272
- "schedule",
273
- scheduler,
274
- app_image=app_image,
275
- runcfg=json.dumps(cfg) if cfg else None,
276
- ) as ctx:
328
+ with log_event("schedule") as ctx:
277
329
  sched = self._scheduler(scheduler)
278
330
  app_id = sched.schedule(dryrun_info)
279
331
  app_handle = make_app_handle(scheduler, self._name, app_id)
332
+
280
333
  app = none_throws(dryrun_info._app)
281
334
  self._apps[app_handle] = app
282
- _, _, app_id = parse_app_handle(app_handle)
283
- ctx._torchx_event.app_id = app_id
335
+
336
+ event = ctx._torchx_event
337
+ event.scheduler = scheduler
338
+ event.runcfg = json.dumps(cfg) if cfg else None
339
+ event.app_id = app_id
340
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
341
+ event.app_metadata = app.metadata
342
+
284
343
  return app_handle
285
344
 
286
345
  def name(self) -> str:
@@ -291,7 +350,7 @@ class Runner:
291
350
  app: AppDef,
292
351
  scheduler: str,
293
352
  cfg: Optional[Mapping[str, CfgVal]] = None,
294
- workspace: Optional[str] = None,
353
+ workspace: Optional[Union[Workspace, str]] = None,
295
354
  parent_run_id: Optional[str] = None,
296
355
  ) -> AppDryRunInfo:
297
356
  """
@@ -343,6 +402,7 @@ class Runner:
343
402
  role.env[ENV_TORCHX_JOB_ID] = make_app_handle(
344
403
  scheduler, self._name, macros.app_id
345
404
  )
405
+ role.env[TORCHX_INTERNAL_SESSION_ID] = get_session_id_or_create_new()
346
406
 
347
407
  if parent_run_id:
348
408
  role.env[ENV_TORCHX_PARENT_RUN_ID] = parent_run_id
@@ -355,33 +415,49 @@ class Runner:
355
415
  role.env[tracker_config_env_var_name(name)] = config
356
416
 
357
417
  cfg = cfg or dict()
358
- with log_event("dryrun", scheduler, runcfg=json.dumps(cfg) if cfg else None):
418
+ with log_event(
419
+ "dryrun",
420
+ scheduler,
421
+ runcfg=json.dumps(cfg) if cfg else None,
422
+ workspace=str(workspace),
423
+ ) as ctx:
359
424
  sched = self._scheduler(scheduler)
360
425
  resolved_cfg = sched.run_opts().resolve(cfg)
361
- if workspace and isinstance(sched, WorkspaceMixin):
362
- role = app.roles[0]
363
- old_img = role.image
364
426
 
365
- logger.info(f"Checking for changes in workspace `{workspace}`...")
366
- logger.info(
367
- 'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
368
- )
369
- sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
370
-
371
- if old_img != role.image:
372
- logger.info(
373
- f"Built new image `{role.image}` based on original image `{old_img}`"
374
- f" and changes in workspace `{workspace}` for role[0]={role.name}."
375
- )
376
- else:
377
- logger.info(
378
- f"Reusing original image `{old_img}` for role[0]={role.name}."
379
- " Either a patch was built or no changes to workspace was detected."
427
+ sched._pre_build_validate(app, scheduler, resolved_cfg)
428
+
429
+ if isinstance(sched, WorkspaceMixin):
430
+ if workspace:
431
+ # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
432
+ # later, torchx added support for the workspace attr in Role
433
+ # for BC, give precedence to the workspace argument over the workspace attr for role[0]
434
+ if app.roles[0].workspace:
435
+ logger.info(
436
+ "Overriding role[%d] (%s) workspace to `%s`"
437
+ "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
438
+ 0,
439
+ role.name,
440
+ str(app.roles[0].workspace),
441
+ )
442
+ app.roles[0].workspace = (
443
+ Workspace.from_str(workspace)
444
+ if isinstance(workspace, str)
445
+ else workspace
380
446
  )
381
447
 
382
- sched._validate(app, scheduler)
448
+ sched.build_workspaces(app.roles, resolved_cfg)
449
+
450
+ sched._validate(app, scheduler, resolved_cfg)
383
451
  dryrun_info = sched.submit_dryrun(app, resolved_cfg)
384
452
  dryrun_info._scheduler = scheduler
453
+
454
+ event = ctx._torchx_event
455
+ event.scheduler = scheduler
456
+ event.runcfg = json.dumps(cfg) if cfg else None
457
+ event.app_id = app.name
458
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
459
+ event.app_metadata = app.metadata
460
+
385
461
  return dryrun_info
386
462
 
387
463
  def scheduler_run_opts(self, scheduler: str) -> runopts:
@@ -400,6 +476,27 @@ class Runner:
400
476
  """
401
477
  return self._scheduler(scheduler).run_opts()
402
478
 
479
+ def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
480
+ """
481
+ Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
482
+
483
+ Usage:
484
+
485
+ .. doctest::
486
+
487
+ from torchx.runner import get_runner
488
+
489
+ runner = get_runner()
490
+ cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
491
+ assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
492
+ """
493
+
494
+ opts = self._scheduler(scheduler).run_opts()
495
+ cfg = {}
496
+ for cfg_str in cfg_literal:
497
+ cfg.update(opts.cfg_from_str(cfg_str))
498
+ return cfg
499
+
403
500
  def scheduler_backends(self) -> List[str]:
404
501
  """
405
502
  Returns a list of all supported scheduler backends.
@@ -490,6 +587,16 @@ class Runner:
490
587
  if status is not None and not status.is_terminal():
491
588
  scheduler.cancel(app_id)
492
589
 
590
+ def delete(self, app_handle: AppHandle) -> None:
591
+ """
592
+ Deletes the application from the scheduler.
593
+ """
594
+ scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
595
+ with log_event("delete", scheduler_backend, app_id):
596
+ status = self.status(app_handle)
597
+ if status is not None:
598
+ scheduler.delete(app_id)
599
+
493
600
  def stop(self, app_handle: AppHandle) -> None:
494
601
  """
495
602
  See method ``cancel``.
@@ -525,7 +632,7 @@ class Runner:
525
632
  if not app:
526
633
  desc = scheduler.describe(app_id)
527
634
  if desc:
528
- app = AppDef(name=app_id, roles=desc.roles)
635
+ app = AppDef(name=app_id, roles=desc.roles, metadata=desc.metadata)
529
636
  return app
530
637
 
531
638
  def log_lines(
@@ -637,7 +744,7 @@ class Runner:
637
744
  app.app_handle = make_app_handle(scheduler, self._name, app.app_id)
638
745
  return apps
639
746
 
640
- # pyre-fixme: Scheduler opts
747
+ # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
641
748
  def _scheduler(self, scheduler: str) -> Scheduler:
642
749
  sched = self._scheduler_instances.get(scheduler)
643
750
  if not sched:
@@ -654,8 +761,8 @@ class Runner:
654
761
  def _scheduler_app_id(
655
762
  self,
656
763
  app_handle: AppHandle,
657
- check_session: bool = True
658
- # pyre-fixme: Scheduler opts
764
+ check_session: bool = True,
765
+ # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
659
766
  ) -> Tuple[Scheduler, str, str]:
660
767
  """
661
768
  Returns the scheduler and app_id from the app_handle.
torchx/runner/config.py CHANGED
@@ -5,6 +5,8 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  """
9
11
  Status: Beta
10
12
 
@@ -71,7 +73,7 @@ CLI Usage
71
73
 
72
74
  #. In addition, it is possible to specify a different config other than .torchxconfig to
73
75
  load at runtime. Requirements are that the config path is specified by environment
74
- variable TORCHX_CONFIG. It also disables hierarchy loading configs from multiple
76
+ variable TORCHXCONFIG. It also disables hierarchy loading configs from multiple
75
77
  directories as the cases otherwise.
76
78
 
77
79
  #. User level .torchxconfig
@@ -195,7 +197,15 @@ def _configparser() -> configparser.ConfigParser:
195
197
 
196
198
 
197
199
  def _get_scheduler(name: str) -> Scheduler:
198
- schedulers = get_scheduler_factories()
200
+ schedulers = {
201
+ **get_scheduler_factories(),
202
+ **(
203
+ get_scheduler_factories(
204
+ group="torchx.schedulers.orchestrator", skip_defaults=True
205
+ )
206
+ or {}
207
+ ),
208
+ }
199
209
  if name not in schedulers:
200
210
  raise ValueError(
201
211
  f"`{name}` is not a registered scheduler. Valid scheduler names: {schedulers.keys()}"
@@ -239,7 +249,16 @@ def dump(
239
249
  if schedulers:
240
250
  scheds = schedulers
241
251
  else:
242
- scheds = get_scheduler_factories().keys()
252
+ scheduler_factories = {
253
+ **get_scheduler_factories(),
254
+ **(
255
+ get_scheduler_factories(
256
+ group="torchx.schedulers.orchestrator", skip_defaults=True
257
+ )
258
+ or {}
259
+ ),
260
+ }
261
+ scheds = scheduler_factories.keys()
243
262
 
244
263
  config = _configparser()
245
264
  for sched_name in scheds:
@@ -259,13 +278,20 @@ def dump(
259
278
  continue
260
279
 
261
280
  # serialize list elements with `;` delimiter (consistent with torchx cli)
262
- if opt.opt_type == List[str]:
281
+ if opt.is_type_list_of_str:
263
282
  # deal with empty or None default lists
264
283
  if opt.default:
265
284
  # pyre-ignore[6] opt.default type checked already as List[str]
266
285
  val = ";".join(opt.default)
267
286
  else:
268
287
  val = _NONE
288
+ elif opt.is_type_dict_of_str:
289
+ # deal with empty or None default lists
290
+ if opt.default:
291
+ # pyre-ignore[16] opt.default type checked already as Dict[str, str]
292
+ val = ";".join([f"{k}:{v}" for k, v in opt.default.items()])
293
+ else:
294
+ val = _NONE
269
295
  else:
270
296
  val = f"{opt.default}"
271
297
 
@@ -468,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:
468
494
 
469
495
  config = os.getenv(ENV_TORCHXCONFIG)
470
496
  if config is not None:
497
+ if not config:
498
+ return []
471
499
  configfile = Path(config)
472
500
  if not configfile.is_file():
473
501
  raise FileNotFoundError(
@@ -480,7 +508,7 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:
480
508
  dirs = DEFAULT_CONFIG_DIRS
481
509
  for d in dirs:
482
510
  configfile = Path(d) / CONFIG_FILE
483
- if configfile.exists():
511
+ if os.access(configfile, os.R_OK):
484
512
  config_files.append(str(configfile))
485
513
  return config_files
486
514
 
@@ -510,21 +538,26 @@ def load(scheduler: str, f: TextIO, cfg: Dict[str, CfgVal]) -> None:
510
538
  # this also handles empty or None lists
511
539
  cfg[name] = None
512
540
  else:
513
- runopt = runopts.get(name)
541
+ opt = runopts.get(name)
514
542
 
515
- if runopt is None:
543
+ if opt is None:
516
544
  log.warning(
517
545
  f"`{name} = {value}` was declared in the [{section}] section "
518
546
  f" of the config file but is not a runopt of `{scheduler}` scheduler."
519
547
  f" Remove the entry from the config file to no longer see this warning"
520
548
  )
521
549
  else:
522
- if runopt.opt_type is bool:
550
+ if opt.opt_type is bool:
523
551
  # need to handle bool specially since str -> bool is based on
524
552
  # str emptiness not value (e.g. bool("False") == True)
525
553
  cfg[name] = config.getboolean(section, name)
526
- elif runopt.opt_type is List[str]:
554
+ elif opt.is_type_list_of_str:
527
555
  cfg[name] = value.split(";")
556
+ elif opt.is_type_dict_of_str:
557
+ cfg[name] = {
558
+ s.split(":", 1)[0]: s.split(":", 1)[1]
559
+ for s in value.replace(",", ";").split(";")
560
+ }
528
561
  else:
529
562
  # pyre-ignore[29]
530
- cfg[name] = runopt.opt_type(value)
563
+ cfg[name] = opt.opt_type(value)
@@ -5,6 +5,8 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  """
9
11
  Module contains events processing mechanisms that are integrated with the standard python logging.
10
12
 
@@ -18,18 +20,22 @@ Example of usage:
18
20
 
19
21
  """
20
22
 
23
+ import json
21
24
  import logging
25
+ import sys
22
26
  import time
23
27
  import traceback
24
28
  from types import TracebackType
25
- from typing import Optional, Type
29
+ from typing import Dict, Optional, Type
26
30
 
27
31
  from torchx.runner.events.handlers import get_logging_handler
32
+ from torchx.util.session import get_session_id_or_create_new
28
33
 
29
34
  from .api import SourceType, TorchxEvent # noqa F401
30
35
 
31
- # pyre-fixme[9]: _events_logger is a global variable
32
- _events_logger: logging.Logger = None
36
+ _events_logger: Optional[logging.Logger] = None
37
+
38
+ log: logging.Logger = logging.getLogger(__name__)
33
39
 
34
40
 
35
41
  def _get_or_create_logger(destination: str = "null") -> logging.Logger:
@@ -46,19 +52,28 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
46
52
  a new logger if None provided.
47
53
  """
48
54
  global _events_logger
55
+
49
56
  if _events_logger:
50
57
  return _events_logger
51
- logging_handler = get_logging_handler(destination)
52
- _events_logger = logging.getLogger(f"torchx-events-{destination}")
53
- _events_logger.setLevel(logging.DEBUG)
54
- # Do not propagate message to the root logger
55
- _events_logger.propagate = False
56
- _events_logger.addHandler(logging_handler)
57
- return _events_logger
58
+ else:
59
+ logging_handler = get_logging_handler(destination)
60
+ logging_handler.setLevel(logging.DEBUG)
61
+ _events_logger = logging.getLogger(f"torchx-events-{destination}")
62
+ # Do not propagate message to the root logger
63
+ _events_logger.propagate = False
64
+ _events_logger.addHandler(logging_handler)
65
+
66
+ assert _events_logger # make type-checker happy
67
+ return _events_logger
58
68
 
59
69
 
60
70
  def record(event: TorchxEvent, destination: str = "null") -> None:
61
- _get_or_create_logger(destination).info(event.serialize())
71
+ try:
72
+ serialized_event = event.serialize()
73
+ except Exception:
74
+ log.exception("failed to serialize event, will not record event")
75
+ else:
76
+ _get_or_create_logger(destination).info(serialized_event)
62
77
 
63
78
 
64
79
  class log_event:
@@ -82,17 +97,28 @@ class log_event:
82
97
  scheduler: Optional[str] = None,
83
98
  app_id: Optional[str] = None,
84
99
  app_image: Optional[str] = None,
100
+ app_metadata: Optional[Dict[str, str]] = None,
85
101
  runcfg: Optional[str] = None,
102
+ workspace: Optional[str] = None,
86
103
  ) -> None:
87
104
  self._torchx_event: TorchxEvent = self._generate_torchx_event(
88
- api, scheduler or "", app_id, app_image=app_image, runcfg=runcfg
105
+ api,
106
+ scheduler or "",
107
+ app_id,
108
+ app_image=app_image,
109
+ app_metadata=app_metadata,
110
+ runcfg=runcfg,
111
+ workspace=workspace,
89
112
  )
90
113
  self._start_cpu_time_ns = 0
91
114
  self._start_wall_time_ns = 0
115
+ self._start_epoch_time_usec = 0
92
116
 
93
117
  def __enter__(self) -> "log_event":
94
118
  self._start_cpu_time_ns = time.process_time_ns()
95
119
  self._start_wall_time_ns = time.perf_counter_ns()
120
+ self._torchx_event.start_epoch_time_usec = int(time.time() * 1_000_000)
121
+
96
122
  return self
97
123
 
98
124
  def __exit__(
@@ -109,6 +135,20 @@ class log_event:
109
135
  ) // 1000
110
136
  if traceback_type:
111
137
  self._torchx_event.raw_exception = traceback.format_exc()
138
+ typ, value, tb = sys.exc_info()
139
+ if tb:
140
+ last_frame = traceback.extract_tb(tb)[-1]
141
+ self._torchx_event.exception_source_location = json.dumps(
142
+ {
143
+ "filename": last_frame.filename,
144
+ "lineno": last_frame.lineno,
145
+ "name": last_frame.name,
146
+ }
147
+ )
148
+ if exec_type:
149
+ self._torchx_event.exception_type = exec_type.__name__
150
+ if exec_value:
151
+ self._torchx_event.exception_message = str(exec_value)
112
152
  record(self._torchx_event)
113
153
 
114
154
  def _generate_torchx_event(
@@ -117,15 +157,19 @@ class log_event:
117
157
  scheduler: str,
118
158
  app_id: Optional[str] = None,
119
159
  app_image: Optional[str] = None,
160
+ app_metadata: Optional[Dict[str, str]] = None,
120
161
  runcfg: Optional[str] = None,
121
162
  source: SourceType = SourceType.UNKNOWN,
163
+ workspace: Optional[str] = None,
122
164
  ) -> TorchxEvent:
123
165
  return TorchxEvent(
124
- session=app_id or "",
166
+ session=get_session_id_or_create_new(),
125
167
  scheduler=scheduler,
126
168
  api=api,
127
169
  app_id=app_id,
128
170
  app_image=app_image,
171
+ app_metadata=app_metadata,
129
172
  runcfg=runcfg,
130
173
  source=source,
174
+ workspace=workspace,
131
175
  )