torchx-nightly 2024.1.24__py3-none-any.whl → 2024.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/schedulers/aws_batch_scheduler.py +13 -1
- torchx/schedulers/gcp_batch_scheduler.py +5 -0
- torchx/schedulers/kubernetes_scheduler.py +1 -1
- torchx/tracker/__init__.py +8 -4
- torchx/tracker/api.py +11 -14
- torchx/util/modules.py +33 -0
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/METADATA +1 -1
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/RECORD +12 -11
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/LICENSE +0 -0
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/WHEEL +0 -0
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2024.1.24.dist-info → torchx_nightly-2024.1.25.dist-info}/top_level.txt +0 -0
|
@@ -170,7 +170,10 @@ def resource_from_resource_requirements(
|
|
|
170
170
|
|
|
171
171
|
|
|
172
172
|
def _role_to_node_properties(
|
|
173
|
-
role: Role,
|
|
173
|
+
role: Role,
|
|
174
|
+
start_idx: int,
|
|
175
|
+
privileged: bool = False,
|
|
176
|
+
job_role_arn: Optional[str] = None,
|
|
174
177
|
) -> Dict[str, object]:
|
|
175
178
|
role.mounts += get_device_mounts(role.resource.devices)
|
|
176
179
|
|
|
@@ -245,6 +248,8 @@ def _role_to_node_properties(
|
|
|
245
248
|
"mountPoints": mount_points,
|
|
246
249
|
"volumes": volumes,
|
|
247
250
|
}
|
|
251
|
+
if job_role_arn:
|
|
252
|
+
container["jobRoleArn"] = job_role_arn
|
|
248
253
|
if role.num_replicas > 1:
|
|
249
254
|
instance_type = instance_type_from_resource(role.resource)
|
|
250
255
|
if instance_type is not None:
|
|
@@ -349,6 +354,7 @@ class AWSBatchOpts(TypedDict, total=False):
|
|
|
349
354
|
privileged: bool
|
|
350
355
|
share_id: Optional[str]
|
|
351
356
|
priority: int
|
|
357
|
+
job_role_arn: Optional[str]
|
|
352
358
|
|
|
353
359
|
|
|
354
360
|
class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
|
|
@@ -498,6 +504,7 @@ class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
|
|
|
498
504
|
role,
|
|
499
505
|
start_idx=node_idx,
|
|
500
506
|
privileged=cfg["privileged"],
|
|
507
|
+
job_role_arn=cfg.get("job_role_arn"),
|
|
501
508
|
)
|
|
502
509
|
)
|
|
503
510
|
node_idx += role.num_replicas
|
|
@@ -573,6 +580,11 @@ class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
|
|
|
573
580
|
"Higher number (between 0 and 9999) means higher priority. "
|
|
574
581
|
"This will only take effect if the job queue has a scheduling policy.",
|
|
575
582
|
)
|
|
583
|
+
opts.add(
|
|
584
|
+
"job_role_arn",
|
|
585
|
+
type_=str,
|
|
586
|
+
help="The Amazon Resource Name (ARN) of the IAM role that the container can assume for AWS permissions.",
|
|
587
|
+
)
|
|
576
588
|
return opts
|
|
577
589
|
|
|
578
590
|
def _get_job_id(self, app_id: str) -> Optional[str]:
|
|
@@ -205,12 +205,14 @@ class GCPBatchScheduler(Scheduler[GCPBatchOpts]):
|
|
|
205
205
|
if cpu <= 0:
|
|
206
206
|
cpu = 1
|
|
207
207
|
MILLI = 1000
|
|
208
|
+
# pyre-fixme[8]: Attribute has type `Field`; used as `int`.
|
|
208
209
|
res.cpu_milli = cpu * MILLI
|
|
209
210
|
memMB = resource.memMB
|
|
210
211
|
if memMB < 0:
|
|
211
212
|
raise ValueError(
|
|
212
213
|
f"memMB should to be set to a positive value, got {memMB}"
|
|
213
214
|
)
|
|
215
|
+
# pyre-fixme[8]: Attribute has type `Field`; used as `int`.
|
|
214
216
|
res.memory_mib = memMB
|
|
215
217
|
|
|
216
218
|
# TODO support named resources
|
|
@@ -358,11 +360,13 @@ class GCPBatchScheduler(Scheduler[GCPBatchOpts]):
|
|
|
358
360
|
return None
|
|
359
361
|
|
|
360
362
|
gpu = 0
|
|
363
|
+
# pyre-fixme[16]: `Field` has no attribute `instances`.
|
|
361
364
|
if len(job.allocation_policy.instances) != 0:
|
|
362
365
|
gpu_type = job.allocation_policy.instances[0].policy.machine_type
|
|
363
366
|
gpu = GPU_TYPE_TO_COUNT[gpu_type]
|
|
364
367
|
|
|
365
368
|
roles = {}
|
|
369
|
+
# pyre-fixme[16]: `RepeatedField` has no attribute `__iter__`.
|
|
366
370
|
for tg in job.task_groups:
|
|
367
371
|
env = tg.task_spec.environment.variables
|
|
368
372
|
role = env["TORCHX_ROLE_NAME"]
|
|
@@ -386,6 +390,7 @@ class GCPBatchScheduler(Scheduler[GCPBatchOpts]):
|
|
|
386
390
|
# TODO map role/replica status
|
|
387
391
|
desc = DescribeAppResponse(
|
|
388
392
|
app_id=app_id,
|
|
393
|
+
# pyre-fixme[16]: `Field` has no attribute `state`.
|
|
389
394
|
state=JOB_STATE[job.status.state.name],
|
|
390
395
|
roles=list(roles.values()),
|
|
391
396
|
)
|
|
@@ -23,7 +23,7 @@ Install Volcano:
|
|
|
23
23
|
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.6.0/installer/volcano-development.yaml
|
|
24
24
|
|
|
25
25
|
See the
|
|
26
|
-
`Volcano Quickstart <https://github.com/volcano-sh/volcano#
|
|
26
|
+
`Volcano Quickstart <https://github.com/volcano-sh/volcano#quick-start-guide>`_
|
|
27
27
|
for more information.
|
|
28
28
|
"""
|
|
29
29
|
|
torchx/tracker/__init__.py
CHANGED
|
@@ -37,7 +37,7 @@ Tracker Setup
|
|
|
37
37
|
-------------
|
|
38
38
|
To enable tracking it requires:
|
|
39
39
|
|
|
40
|
-
1. Defining tracker backends (entrypoints and configuration) on launcher side using :doc:`runner.config`
|
|
40
|
+
1. Defining tracker backends (entrypoints/modules and configuration) on launcher side using :doc:`runner.config`
|
|
41
41
|
2. Adding entrypoints within a user job using entry_points (`specification`_)
|
|
42
42
|
|
|
43
43
|
.. _specification: https://packaging.python.org/en/latest/specifications/entry-points/
|
|
@@ -49,13 +49,13 @@ To enable tracking it requires:
|
|
|
49
49
|
User can define any number of tracker backends under **torchx:tracker** section in :doc:`runner.config`, where:
|
|
50
50
|
* Key: is an arbitrary name for the tracker, where the name will be used to configure its properties
|
|
51
51
|
under [tracker:<TRACKER_NAME>]
|
|
52
|
-
* Value: is *entrypoint
|
|
52
|
+
* Value: is *entrypoint* or *module* factory method that must be available within user job. The value will be injected into a
|
|
53
53
|
user job and used to construct tracker implementation.
|
|
54
54
|
|
|
55
55
|
.. code-block:: ini
|
|
56
56
|
|
|
57
57
|
[torchx:tracker]
|
|
58
|
-
tracker_name=<
|
|
58
|
+
tracker_name=<entry_point_or_module_factory_method>
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
Each tracker can be additionally configured (currently limited to `config` parameter) under `[tracker:<TRACKER NAME>]` section:
|
|
@@ -71,11 +71,15 @@ For example, ~/.torchxconfig may be setup as:
|
|
|
71
71
|
|
|
72
72
|
[torchx:tracker]
|
|
73
73
|
tracker1=tracker1
|
|
74
|
-
|
|
74
|
+
tracker2=backend_2_entry_point
|
|
75
|
+
tracker3=torchx.tracker.mlflow:create_tracker
|
|
75
76
|
|
|
76
77
|
[tracker:tracker1]
|
|
77
78
|
config=s3://my_bucket/config.json
|
|
78
79
|
|
|
80
|
+
[tracker:tracker3]
|
|
81
|
+
config=my_config.json
|
|
82
|
+
|
|
79
83
|
|
|
80
84
|
2. User job configuration (Advanced)
|
|
81
85
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
torchx/tracker/api.py
CHANGED
|
@@ -14,6 +14,7 @@ from functools import lru_cache
|
|
|
14
14
|
from typing import Iterable, Mapping, Optional
|
|
15
15
|
|
|
16
16
|
from torchx.util.entrypoints import load_group
|
|
17
|
+
from torchx.util.modules import load_module
|
|
17
18
|
|
|
18
19
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
19
20
|
|
|
@@ -177,30 +178,26 @@ def _extract_tracker_name_and_config_from_environ() -> Mapping[str, Optional[str
|
|
|
177
178
|
|
|
178
179
|
|
|
179
180
|
def build_trackers(
|
|
180
|
-
|
|
181
|
+
factory_and_config: Mapping[str, Optional[str]]
|
|
181
182
|
) -> Iterable[TrackerBase]:
|
|
182
183
|
trackers = []
|
|
183
184
|
|
|
184
|
-
entrypoint_factories = load_group("torchx.tracker")
|
|
185
|
+
entrypoint_factories = load_group("torchx.tracker") or {}
|
|
185
186
|
if not entrypoint_factories:
|
|
186
|
-
logger.warning(
|
|
187
|
-
"No 'torchx.tracker' entry_points are defined. Tracking will not capture any data."
|
|
188
|
-
)
|
|
189
|
-
return trackers
|
|
187
|
+
logger.warning("No 'torchx.tracker' entry_points are defined.")
|
|
190
188
|
|
|
191
|
-
for
|
|
192
|
-
|
|
189
|
+
for factory_name, config in factory_and_config.items():
|
|
190
|
+
factory = entrypoint_factories.get(factory_name) or load_module(factory_name)
|
|
191
|
+
if not factory:
|
|
193
192
|
logger.warning(
|
|
194
|
-
f"
|
|
193
|
+
f"No tracker factory `{factory_name}` found in entry_points or modules. See https://pytorch.org/torchx/main/tracker.html#module-torchx.tracker"
|
|
195
194
|
)
|
|
196
195
|
continue
|
|
197
|
-
factory = entrypoint_factories[entrypoint_key]
|
|
198
196
|
if config:
|
|
199
|
-
logger.info(f"Tracker config found for `{
|
|
200
|
-
tracker = factory(config)
|
|
197
|
+
logger.info(f"Tracker config found for `{factory_name}` as `{config}`")
|
|
201
198
|
else:
|
|
202
|
-
logger.info(f"No tracker config specified for `{
|
|
203
|
-
|
|
199
|
+
logger.info(f"No tracker config specified for `{factory_name}`")
|
|
200
|
+
tracker = factory(config)
|
|
204
201
|
trackers.append(tracker)
|
|
205
202
|
return trackers
|
|
206
203
|
|
torchx/util/modules.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
import importlib
|
|
8
|
+
from types import ModuleType
|
|
9
|
+
from typing import Callable, Optional, Union
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_module(path: str) -> Union[ModuleType, Optional[Callable[..., object]]]:
|
|
13
|
+
"""
|
|
14
|
+
Loads and returns the module/module attr represented by the ``path``: ``full.module.path:optional_attr``
|
|
15
|
+
|
|
16
|
+
::
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
1. ``load_module("this.is.a_module:fn")`` -> equivalent to ``this.is.a_module.fn``
|
|
20
|
+
1. ``load_module("this.is.a_module")`` -> equivalent to ``this.is.a_module``
|
|
21
|
+
"""
|
|
22
|
+
parts = path.split(":", 2)
|
|
23
|
+
module_path, method = parts[0], parts[1] if len(parts) > 1 else None
|
|
24
|
+
module = None
|
|
25
|
+
i, n = -1, len(module_path)
|
|
26
|
+
try:
|
|
27
|
+
while i < n:
|
|
28
|
+
i = module_path.find(".", i + 1)
|
|
29
|
+
i = i if i >= 0 else n
|
|
30
|
+
module = importlib.import_module(module_path[:i])
|
|
31
|
+
return getattr(module, method) if method else module
|
|
32
|
+
except Exception:
|
|
33
|
+
return None
|
|
@@ -65,13 +65,13 @@ torchx/runtime/tracking/__init__.py,sha256=uHbJ1NqsxFWGYz2aV0_p4OCMhW467zDJu_86B
|
|
|
65
65
|
torchx/runtime/tracking/api.py,sha256=9mlsCnnKP8hfvypNcEX2_57OYMW4AuTMM-nvsIgOzK4,5457
|
|
66
66
|
torchx/schedulers/__init__.py,sha256=cCansxGU45SV_lxhgzyw2on7AJyIvhprAFo6Di1x9xQ,2157
|
|
67
67
|
torchx/schedulers/api.py,sha256=XlYrD6ZjV71HotJxdVZxA_Zc8DuxhM4KKCnkibqZflU,14140
|
|
68
|
-
torchx/schedulers/aws_batch_scheduler.py,sha256=
|
|
68
|
+
torchx/schedulers/aws_batch_scheduler.py,sha256=t6wGK1NUjhTAoxH3ie7lIMTl2cxwdixkcT3HxKsggDk,27517
|
|
69
69
|
torchx/schedulers/devices.py,sha256=PNbcpf8fEM18Ag1RgK9Q30zPBalEcPdsFWctdbLxuv8,1352
|
|
70
70
|
torchx/schedulers/docker_scheduler.py,sha256=I-kZN-dXoQyokLPe9ZKjfhkVX5lHx_C5jvLLc2JmXQQ,15456
|
|
71
|
-
torchx/schedulers/gcp_batch_scheduler.py,sha256=
|
|
71
|
+
torchx/schedulers/gcp_batch_scheduler.py,sha256=mBxJbrNTUbIuYmudzyhOOcf8KAuUpxhiQMTDgJPtL8M,16549
|
|
72
72
|
torchx/schedulers/ids.py,sha256=IGsJEbCYTdfKdU3MhKLQU6b7sWCJy5dlRV6JIL_9BlE,1783
|
|
73
73
|
torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=xAt-on3K8HwS2kzWasn0zXd2q4IDQzo2N5A5Ehh9NII,42885
|
|
74
|
-
torchx/schedulers/kubernetes_scheduler.py,sha256=
|
|
74
|
+
torchx/schedulers/kubernetes_scheduler.py,sha256=6NXYJwiYCXNeB3ubr8t4q_SuAa-vlYdiCAPXTB3f-zg,27068
|
|
75
75
|
torchx/schedulers/local_scheduler.py,sha256=QUxOLRN8CSmlEUlF_cCKveZeWcieXor2rAuAGtHNzfE,40133
|
|
76
76
|
torchx/schedulers/lsf_scheduler.py,sha256=KM4-LSBiTYdtP1Js8F9dSjAdNwimaTaLraZmgnZiRuI,17638
|
|
77
77
|
torchx/schedulers/ray_scheduler.py,sha256=unnDtDu1rPpCLJLDcm4NYRo9ZCCtQgG5BtlHwVfly-U,17448
|
|
@@ -93,8 +93,8 @@ torchx/specs/test/components/a/b/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RM
|
|
|
93
93
|
torchx/specs/test/components/a/b/c.py,sha256=QyTZfsCaSZscmk3DeNOkAyMoz6GCcayrWtOKbNFIZ1M,539
|
|
94
94
|
torchx/specs/test/components/c/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
|
|
95
95
|
torchx/specs/test/components/c/d.py,sha256=RH07jjo6uvFbzIaNFnAwmD_h24cEsT8kyZDTN-ezFio,531
|
|
96
|
-
torchx/tracker/__init__.py,sha256=
|
|
97
|
-
torchx/tracker/api.py,sha256=
|
|
96
|
+
torchx/tracker/__init__.py,sha256=3uUMRTQbQ-EcJrAofn3YiYJ00QwzbK4j6bJdgX8_D-k,4350
|
|
97
|
+
torchx/tracker/api.py,sha256=1K7X5mtkoEonICOEXCpPHyefcnILI_VZhqDpPb-yOmc,11254
|
|
98
98
|
torchx/tracker/mlflow.py,sha256=poeoIXVPzr2sxgi515fMGRH83KAFNL6XFILMh0EQ2Dw,14487
|
|
99
99
|
torchx/tracker/backend/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
|
|
100
100
|
torchx/tracker/backend/fsspec.py,sha256=JpSioMgn54mrxqqpY0kw5Gudqx9hhxkgDLaOFSEP2Ko,10425
|
|
@@ -103,6 +103,7 @@ torchx/util/cuda.py,sha256=GiAtP9-T4zJxwHl7r5qGQRuINb59Mzwz8tnzB1MVY74,1084
|
|
|
103
103
|
torchx/util/datetime.py,sha256=e-sO5Wjx1Utpln14C3qfJHl4v3KM-SMnn11hSyvkqFY,390
|
|
104
104
|
torchx/util/entrypoints.py,sha256=C4A7cF1tPLlfyYWyZ7uZEtsKeuoOoLbMv0sOSxLhXs4,2710
|
|
105
105
|
torchx/util/io.py,sha256=sxb6KI42Lq6n5z6_-YKW_mAhgPdC6CxzexlMyGheWSc,1792
|
|
106
|
+
torchx/util/modules.py,sha256=PjAvkC199EYEQCRIM-Fmrb1DRnySNcnV3xWrfKtqaqQ,1116
|
|
106
107
|
torchx/util/shlex.py,sha256=KzyWektMeU3oXS3Z5mFkNSPLItBTszVcvQ3EYfOMUYA,448
|
|
107
108
|
torchx/util/strings.py,sha256=7CZe5WKHa7IQ6DuJCYeJ5FapUC4Fd1OGeq1yZAmjluw,663
|
|
108
109
|
torchx/util/types.py,sha256=6ASuDKGO91UU3DCSuWhPX_C03341tApLCQEByUz8xpY,7016
|
|
@@ -110,9 +111,9 @@ torchx/workspace/__init__.py,sha256=KbGEzJqqXaIxALm_EQO64aw-fE7MeDMFXcpU1mY650I,
|
|
|
110
111
|
torchx/workspace/api.py,sha256=Ej6DR__mNWaVyZgoVNAAOloDy1kTD5X1jz7pRtoVf80,5464
|
|
111
112
|
torchx/workspace/dir_workspace.py,sha256=Fz-hKIx0KN8iJf2BsthNj0NvTkWlxP6WFsElPs_BaT0,2253
|
|
112
113
|
torchx/workspace/docker_workspace.py,sha256=Yd8ut26bNfjyJQnmH8ANOrflfr-4VKcnOrIjbi_XIUY,9208
|
|
113
|
-
torchx_nightly-2024.1.
|
|
114
|
-
torchx_nightly-2024.1.
|
|
115
|
-
torchx_nightly-2024.1.
|
|
116
|
-
torchx_nightly-2024.1.
|
|
117
|
-
torchx_nightly-2024.1.
|
|
118
|
-
torchx_nightly-2024.1.
|
|
114
|
+
torchx_nightly-2024.1.25.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
|
|
115
|
+
torchx_nightly-2024.1.25.dist-info/METADATA,sha256=ZXhFYcZqrWvvAe-gwbPkVhYDMe8uJtpqbNP29mTRDe4,5611
|
|
116
|
+
torchx_nightly-2024.1.25.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
117
|
+
torchx_nightly-2024.1.25.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
|
|
118
|
+
torchx_nightly-2024.1.25.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
|
|
119
|
+
torchx_nightly-2024.1.25.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|