lightning-sdk 0.1.42__py3-none-any.whl → 0.1.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/api/job_api.py +35 -0
- lightning_sdk/api/utils.py +8 -0
- lightning_sdk/cli/run.py +113 -4
- lightning_sdk/cli/serve.py +102 -14
- lightning_sdk/job/base.py +10 -0
- lightning_sdk/job/job.py +28 -4
- lightning_sdk/job/v1.py +5 -0
- lightning_sdk/job/v2.py +18 -0
- lightning_sdk/job/work.py +10 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_membership.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_state.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_project_membership.py +27 -1
- lightning_sdk/mmt/__init__.py +2 -1
- lightning_sdk/mmt/base.py +117 -15
- lightning_sdk/mmt/mmt.py +114 -22
- lightning_sdk/mmt/v1.py +56 -0
- lightning_sdk/mmt/v2.py +57 -0
- lightning_sdk/plugin.py +28 -23
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/METADATA +2 -1
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/RECORD +25 -26
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/entry_points.txt +0 -1
- lightning_sdk/cli/mmt.py +0 -138
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/top_level.txt +0 -0
|
@@ -58,6 +58,7 @@ class V1ProjectMembership(object):
|
|
|
58
58
|
'last_name': 'str',
|
|
59
59
|
'membership_count': 'str',
|
|
60
60
|
'name': 'str',
|
|
61
|
+
'next_free_credits_grant_at': 'datetime',
|
|
61
62
|
'organization': 'str',
|
|
62
63
|
'owner_id': 'str',
|
|
63
64
|
'owner_type': 'V1OwnerType',
|
|
@@ -87,6 +88,7 @@ class V1ProjectMembership(object):
|
|
|
87
88
|
'last_name': 'lastName',
|
|
88
89
|
'membership_count': 'membershipCount',
|
|
89
90
|
'name': 'name',
|
|
91
|
+
'next_free_credits_grant_at': 'nextFreeCreditsGrantAt',
|
|
90
92
|
'organization': 'organization',
|
|
91
93
|
'owner_id': 'ownerId',
|
|
92
94
|
'owner_type': 'ownerType',
|
|
@@ -98,7 +100,7 @@ class V1ProjectMembership(object):
|
|
|
98
100
|
'username': 'username'
|
|
99
101
|
}
|
|
100
102
|
|
|
101
|
-
def __init__(self, avatar_url: 'str' =None, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, email: 'str' =None, first_name: 'str' =None, free_credits_enabled: 'bool' =None, inactive: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, job_title: 'str' =None, last_name: 'str' =None, membership_count: 'str' =None, name: 'str' =None, organization: 'str' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None, username: 'str' =None): # noqa: E501
|
|
103
|
+
def __init__(self, avatar_url: 'str' =None, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, email: 'str' =None, first_name: 'str' =None, free_credits_enabled: 'bool' =None, inactive: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, job_title: 'str' =None, last_name: 'str' =None, membership_count: 'str' =None, name: 'str' =None, next_free_credits_grant_at: 'datetime' =None, organization: 'str' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None, username: 'str' =None): # noqa: E501
|
|
102
104
|
"""V1ProjectMembership - a model defined in Swagger""" # noqa: E501
|
|
103
105
|
self._avatar_url = None
|
|
104
106
|
self._balance = None
|
|
@@ -117,6 +119,7 @@ class V1ProjectMembership(object):
|
|
|
117
119
|
self._last_name = None
|
|
118
120
|
self._membership_count = None
|
|
119
121
|
self._name = None
|
|
122
|
+
self._next_free_credits_grant_at = None
|
|
120
123
|
self._organization = None
|
|
121
124
|
self._owner_id = None
|
|
122
125
|
self._owner_type = None
|
|
@@ -161,6 +164,8 @@ class V1ProjectMembership(object):
|
|
|
161
164
|
self.membership_count = membership_count
|
|
162
165
|
if name is not None:
|
|
163
166
|
self.name = name
|
|
167
|
+
if next_free_credits_grant_at is not None:
|
|
168
|
+
self.next_free_credits_grant_at = next_free_credits_grant_at
|
|
164
169
|
if organization is not None:
|
|
165
170
|
self.organization = organization
|
|
166
171
|
if owner_id is not None:
|
|
@@ -537,6 +542,27 @@ class V1ProjectMembership(object):
|
|
|
537
542
|
|
|
538
543
|
self._name = name
|
|
539
544
|
|
|
545
|
+
@property
|
|
546
|
+
def next_free_credits_grant_at(self) -> 'datetime':
|
|
547
|
+
"""Gets the next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
:return: The next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
|
|
551
|
+
:rtype: datetime
|
|
552
|
+
"""
|
|
553
|
+
return self._next_free_credits_grant_at
|
|
554
|
+
|
|
555
|
+
@next_free_credits_grant_at.setter
|
|
556
|
+
def next_free_credits_grant_at(self, next_free_credits_grant_at: 'datetime'):
|
|
557
|
+
"""Sets the next_free_credits_grant_at of this V1ProjectMembership.
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
:param next_free_credits_grant_at: The next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
|
|
561
|
+
:type: datetime
|
|
562
|
+
"""
|
|
563
|
+
|
|
564
|
+
self._next_free_credits_grant_at = next_free_credits_grant_at
|
|
565
|
+
|
|
540
566
|
@property
|
|
541
567
|
def organization(self) -> 'str':
|
|
542
568
|
"""Gets the organization of this V1ProjectMembership. # noqa: E501
|
lightning_sdk/mmt/__init__.py
CHANGED
lightning_sdk/mmt/base.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
|
-
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, Optional, Protocol, Tuple, Union
|
|
3
3
|
|
|
4
4
|
if TYPE_CHECKING:
|
|
5
5
|
from lightning_sdk.machine import Machine
|
|
@@ -10,11 +10,41 @@ if TYPE_CHECKING:
|
|
|
10
10
|
from lightning_sdk.user import User
|
|
11
11
|
|
|
12
12
|
from lightning_sdk.job.base import _BaseJob
|
|
13
|
-
from lightning_sdk.job.job import Job
|
|
14
13
|
from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
|
|
15
14
|
|
|
16
15
|
|
|
16
|
+
class MMTMachine(Protocol):
|
|
17
|
+
"""A single machine in multi-machine training."""
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def name(self) -> str:
|
|
21
|
+
"""The Name of the individual machine. Usually corresponds to the rank."""
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def machine(self) -> "Machine":
|
|
26
|
+
"""The actual machine type this node is running on."""
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
@property
|
|
30
|
+
def artifact_path(self) -> Optional[str]:
|
|
31
|
+
"""The path to the artifacts of this job."""
|
|
32
|
+
...
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def status(self) -> "Status":
|
|
36
|
+
"""The status of this job."""
|
|
37
|
+
...
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def logs(self) -> str:
|
|
41
|
+
"""The logs of the given machine."""
|
|
42
|
+
...
|
|
43
|
+
|
|
44
|
+
|
|
17
45
|
class _BaseMMT(_BaseJob):
|
|
46
|
+
"""Base interface to all job types."""
|
|
47
|
+
|
|
18
48
|
@classmethod
|
|
19
49
|
def run(
|
|
20
50
|
cls,
|
|
@@ -36,6 +66,39 @@ class _BaseMMT(_BaseJob):
|
|
|
36
66
|
artifacts_remote: Optional[str] = None,
|
|
37
67
|
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
38
68
|
) -> "_BaseMMT":
|
|
69
|
+
"""Run async workloads using a docker image across multiple machines.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
73
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
74
|
+
num_machines: The number of machines to run on.
|
|
75
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
76
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
77
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
78
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
79
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
80
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
81
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
82
|
+
cloud_account: The cloud account to run the job on.
|
|
83
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
84
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
85
|
+
env: Environment variables to set inside the job.
|
|
86
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
87
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
88
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
89
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
90
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
91
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
92
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
93
|
+
Only supported for jobs with a docker image compute environment.
|
|
94
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
95
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
96
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
97
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
98
|
+
within it.
|
|
99
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
100
|
+
Only supported for jobs with a docker image compute environment.
|
|
101
|
+
"""
|
|
39
102
|
from lightning_sdk.studio import Studio
|
|
40
103
|
|
|
41
104
|
cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
|
|
@@ -134,53 +197,92 @@ class _BaseMMT(_BaseJob):
|
|
|
134
197
|
artifacts_local: Optional[str] = None,
|
|
135
198
|
artifacts_remote: Optional[str] = None,
|
|
136
199
|
) -> None:
|
|
137
|
-
"""
|
|
200
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
num_machines: The number of machines to run on.
|
|
204
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
205
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
206
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
207
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
208
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
209
|
+
env: Environment variables to set inside the job.
|
|
210
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
211
|
+
cloud_account: The cloud account to run the job on.
|
|
212
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
213
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
214
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
215
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
216
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
217
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
218
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
219
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
220
|
+
Only supported for jobs with a docker image compute environment.
|
|
221
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
222
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
223
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
224
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
225
|
+
within it.
|
|
226
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
227
|
+
Only supported for jobs with a docker image compute environment.
|
|
228
|
+
"""
|
|
138
229
|
|
|
139
230
|
@property
|
|
140
231
|
@abstractmethod
|
|
141
|
-
def machines(self) -> Tuple[
|
|
142
|
-
|
|
232
|
+
def machines(self) -> Tuple[MMTMachine, ...]:
|
|
233
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
143
234
|
|
|
144
235
|
@property
|
|
145
236
|
@abstractmethod
|
|
146
237
|
def machine(self) -> "Machine":
|
|
147
|
-
|
|
238
|
+
"""Returns the machine type this job is running on."""
|
|
148
239
|
|
|
149
240
|
@abstractmethod
|
|
150
241
|
def stop(self) -> None:
|
|
151
|
-
|
|
242
|
+
"""Stops the job."""
|
|
152
243
|
|
|
153
244
|
@abstractmethod
|
|
154
245
|
def delete(self) -> None:
|
|
155
|
-
|
|
246
|
+
"""Deletes the job.
|
|
247
|
+
|
|
248
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
249
|
+
"""
|
|
156
250
|
|
|
157
251
|
@property
|
|
158
252
|
@abstractmethod
|
|
159
253
|
def status(self) -> "Status":
|
|
160
|
-
|
|
254
|
+
"""The current status of the job."""
|
|
161
255
|
|
|
162
256
|
@property
|
|
163
257
|
@abstractmethod
|
|
164
258
|
def artifact_path(self) -> Optional[str]:
|
|
165
|
-
|
|
259
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
166
260
|
|
|
167
261
|
@property
|
|
168
262
|
@abstractmethod
|
|
169
263
|
def snapshot_path(self) -> Optional[str]:
|
|
170
|
-
|
|
264
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
171
265
|
|
|
172
266
|
@property
|
|
173
267
|
def share_path(self) -> Optional[str]:
|
|
268
|
+
"""Path to the jobs share path."""
|
|
174
269
|
return None
|
|
175
270
|
|
|
176
|
-
@abstractmethod
|
|
177
|
-
def _update_internal_job(self) -> None:
|
|
178
|
-
pass
|
|
179
|
-
|
|
180
271
|
@property
|
|
181
272
|
def name(self) -> str:
|
|
273
|
+
"""The job's name."""
|
|
182
274
|
return self._name
|
|
183
275
|
|
|
184
276
|
@property
|
|
185
277
|
def teamspace(self) -> "Teamspace":
|
|
278
|
+
"""The teamspace the job is part of."""
|
|
186
279
|
return self._teamspace
|
|
280
|
+
|
|
281
|
+
@property
|
|
282
|
+
def logs(self) -> str:
|
|
283
|
+
"""Logs of the rank 0 machine."""
|
|
284
|
+
return self.machines[0].logs
|
|
285
|
+
|
|
286
|
+
@abstractmethod
|
|
287
|
+
def _update_internal_job(self) -> None:
|
|
288
|
+
pass
|
lightning_sdk/mmt/mmt.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from contextlib import suppress
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
|
2
4
|
|
|
5
|
+
from lightning_sdk.api.user_api import UserApi
|
|
3
6
|
from lightning_sdk.job.job import _has_jobs_v2
|
|
4
|
-
from lightning_sdk.mmt.base import _BaseMMT
|
|
7
|
+
from lightning_sdk.mmt.base import MMTMachine, _BaseMMT
|
|
5
8
|
from lightning_sdk.mmt.v1 import _MMTV1
|
|
6
9
|
from lightning_sdk.mmt.v2 import _MMTV2
|
|
10
|
+
from lightning_sdk.utils.resolve import _setup_logger
|
|
7
11
|
|
|
8
12
|
if TYPE_CHECKING:
|
|
9
13
|
from lightning_sdk.machine import Machine
|
|
@@ -13,28 +17,25 @@ if TYPE_CHECKING:
|
|
|
13
17
|
from lightning_sdk.teamspace import Teamspace
|
|
14
18
|
from lightning_sdk.user import User
|
|
15
19
|
|
|
20
|
+
_logger = _setup_logger(__name__)
|
|
16
21
|
|
|
17
|
-
class MMTMachine(Protocol):
|
|
18
|
-
"""A single machine in multi-machine training."""
|
|
19
22
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def machine(self) -> "Machine":
|
|
26
|
-
...
|
|
23
|
+
@lru_cache(maxsize=None)
|
|
24
|
+
def _has_mmt_v2() -> bool:
|
|
25
|
+
# users need both mmtv2 and jobsv2 flags in order for mmtv2 to work correctly
|
|
26
|
+
if not _has_jobs_v2():
|
|
27
|
+
return False
|
|
27
28
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def status(self) -> "Status":
|
|
34
|
-
...
|
|
29
|
+
api = UserApi()
|
|
30
|
+
try:
|
|
31
|
+
return api._get_feature_flags().mmt_v2
|
|
32
|
+
except Exception:
|
|
33
|
+
return False
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
class MMT(_BaseMMT):
|
|
37
|
+
"""Class to submit and manage multi-machine jobs on the Lightning AI Platform."""
|
|
38
|
+
|
|
38
39
|
_force_v1: (
|
|
39
40
|
bool
|
|
40
41
|
) = False # required for studio plugin still working correctly as v2 currently does not support the studio env
|
|
@@ -48,7 +49,16 @@ class MMT(_BaseMMT):
|
|
|
48
49
|
*,
|
|
49
50
|
_fetch_job: bool = True,
|
|
50
51
|
) -> None:
|
|
51
|
-
|
|
52
|
+
"""Fetch already existing jobs.
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
name: the name of the job
|
|
56
|
+
teamspace: the teamspace the job is part of
|
|
57
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
58
|
+
user: the name of the user owning the `teamspace`
|
|
59
|
+
in case it is owned directly by a user instead of an org.
|
|
60
|
+
"""
|
|
61
|
+
internal_mmt_cls = _MMTV2 if _has_mmt_v2() and not self._force_v1 else _MMTV1
|
|
52
62
|
|
|
53
63
|
self._internal_mmt = internal_mmt_cls(
|
|
54
64
|
name=name,
|
|
@@ -79,6 +89,39 @@ class MMT(_BaseMMT):
|
|
|
79
89
|
artifacts_remote: Optional[str] = None,
|
|
80
90
|
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
81
91
|
) -> "MMT":
|
|
92
|
+
"""Run async workloads using a docker image across multiple machines.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
96
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
97
|
+
num_machines: The number of machines to run on.
|
|
98
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
99
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
100
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
101
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
102
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
103
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
104
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
105
|
+
cloud_account: The cloud account to run the job on.
|
|
106
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
107
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
108
|
+
env: Environment variables to set inside the job.
|
|
109
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
110
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
111
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
112
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
113
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
114
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
115
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
116
|
+
Only supported for jobs with a docker image compute environment.
|
|
117
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
118
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
119
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
120
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
121
|
+
within it.
|
|
122
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
123
|
+
Only supported for jobs with a docker image compute environment.
|
|
124
|
+
"""
|
|
82
125
|
ret_val = super().run(
|
|
83
126
|
name=name,
|
|
84
127
|
num_machines=num_machines,
|
|
@@ -98,8 +141,15 @@ class MMT(_BaseMMT):
|
|
|
98
141
|
artifacts_remote=artifacts_remote,
|
|
99
142
|
cluster=cluster, # deprecated in favor of cloud_account
|
|
100
143
|
)
|
|
101
|
-
# required for typing with "
|
|
144
|
+
# required for typing with "MMT"
|
|
102
145
|
assert isinstance(ret_val, cls)
|
|
146
|
+
|
|
147
|
+
msg = "Multi-Machine Job was successfully launched."
|
|
148
|
+
|
|
149
|
+
with suppress(NotImplementedError):
|
|
150
|
+
msg += f" View it at {ret_val.link}"
|
|
151
|
+
|
|
152
|
+
_logger.info(msg)
|
|
103
153
|
return ret_val
|
|
104
154
|
|
|
105
155
|
def _submit(
|
|
@@ -117,6 +167,35 @@ class MMT(_BaseMMT):
|
|
|
117
167
|
artifacts_local: Optional[str] = None,
|
|
118
168
|
artifacts_remote: Optional[str] = None,
|
|
119
169
|
) -> "MMT":
|
|
170
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
171
|
+
|
|
172
|
+
Args:
|
|
173
|
+
num_machines: The number of machines to run on.
|
|
174
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
175
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
176
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
177
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
178
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
179
|
+
env: Environment variables to set inside the job.
|
|
180
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
181
|
+
cloud_account: The cloud account to run the job on.
|
|
182
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
183
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
184
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
185
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
186
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
187
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
188
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
189
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
190
|
+
Only supported for jobs with a docker image compute environment.
|
|
191
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
192
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
193
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
194
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
195
|
+
within it.
|
|
196
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
197
|
+
Only supported for jobs with a docker image compute environment.
|
|
198
|
+
"""
|
|
120
199
|
self._job = self._internal_mmt._submit(
|
|
121
200
|
num_machines=num_machines,
|
|
122
201
|
machine=machine,
|
|
@@ -134,33 +213,44 @@ class MMT(_BaseMMT):
|
|
|
134
213
|
return self
|
|
135
214
|
|
|
136
215
|
def stop(self) -> None:
|
|
216
|
+
"""Stops the job."""
|
|
137
217
|
return self._internal_mmt.stop()
|
|
138
218
|
|
|
139
219
|
def delete(self) -> None:
|
|
220
|
+
"""Deletes the job.
|
|
221
|
+
|
|
222
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
223
|
+
"""
|
|
140
224
|
return self._internal_mmt.delete()
|
|
141
225
|
|
|
142
226
|
@property
|
|
143
227
|
def status(self) -> "Status":
|
|
228
|
+
"""The current status of the job (accumulated over all machines)."""
|
|
144
229
|
return self._internal_mmt.status
|
|
145
230
|
|
|
146
231
|
@property
|
|
147
232
|
def machines(self) -> Tuple[MMTMachine, ...]:
|
|
233
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
148
234
|
return self._internal_mmt.machines
|
|
149
235
|
|
|
150
236
|
@property
|
|
151
237
|
def machine(self) -> "Machine":
|
|
238
|
+
"""Returns the machine type this job is running on."""
|
|
152
239
|
return self._internal_mmt.machine
|
|
153
240
|
|
|
154
241
|
@property
|
|
155
242
|
def artifact_path(self) -> Optional[str]:
|
|
243
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
156
244
|
return self._internal_mmt.artifact_path
|
|
157
245
|
|
|
158
246
|
@property
|
|
159
247
|
def snapshot_path(self) -> Optional[str]:
|
|
248
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
160
249
|
return self._internal_mmt.snapshot_path
|
|
161
250
|
|
|
162
251
|
@property
|
|
163
252
|
def share_path(self) -> Optional[str]:
|
|
253
|
+
"""Path to the jobs share path."""
|
|
164
254
|
return None
|
|
165
255
|
|
|
166
256
|
def _update_internal_job(self) -> None:
|
|
@@ -168,15 +258,17 @@ class MMT(_BaseMMT):
|
|
|
168
258
|
|
|
169
259
|
@property
|
|
170
260
|
def name(self) -> str:
|
|
261
|
+
"""The job's name."""
|
|
171
262
|
return self._internal_mmt.name
|
|
172
263
|
|
|
173
264
|
@property
|
|
174
265
|
def teamspace(self) -> "Teamspace":
|
|
266
|
+
"""The teamspace the job is part of."""
|
|
175
267
|
return self._internal_mmt._teamspace
|
|
176
268
|
|
|
177
269
|
@property
|
|
178
|
-
def
|
|
179
|
-
return self._internal_mmt.
|
|
270
|
+
def link(self) -> str:
|
|
271
|
+
return self._internal_mmt.link
|
|
180
272
|
|
|
181
273
|
def __getattr__(self, key: str) -> Any:
|
|
182
274
|
"""Forward the attribute lookup to the internal job implementation."""
|
lightning_sdk/mmt/v1.py
CHANGED
|
@@ -16,6 +16,8 @@ from lightning_sdk.mmt.base import _BaseMMT
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class _MMTV1(_BaseMMT):
|
|
19
|
+
"""V1 Implementation of Multi-Machine Training."""
|
|
20
|
+
|
|
19
21
|
def __init__(
|
|
20
22
|
self,
|
|
21
23
|
name: str,
|
|
@@ -25,6 +27,15 @@ class _MMTV1(_BaseMMT):
|
|
|
25
27
|
*,
|
|
26
28
|
_fetch_job: bool = True,
|
|
27
29
|
) -> None:
|
|
30
|
+
"""Fetch already existing jobs.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
name: the name of the job
|
|
34
|
+
teamspace: the teamspace the job is part of
|
|
35
|
+
org: the name of the organization owning the `teamspace` in case it is owned by an org
|
|
36
|
+
user: the name of the user owning the `teamspace`
|
|
37
|
+
in case it is owned directly by a user instead of an org.
|
|
38
|
+
"""
|
|
28
39
|
self._job_api = MMTApiV1()
|
|
29
40
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
30
41
|
|
|
@@ -43,6 +54,35 @@ class _MMTV1(_BaseMMT):
|
|
|
43
54
|
artifacts_local: Optional[str] = None,
|
|
44
55
|
artifacts_remote: Optional[str] = None,
|
|
45
56
|
) -> "_MMTV1":
|
|
57
|
+
"""Submit a new multi-machine job to the Lightning AI platform.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
num_machines: The number of machines to run on.
|
|
61
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
62
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
63
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
64
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
65
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
66
|
+
env: Environment variables to set inside the job.
|
|
67
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
68
|
+
cloud_account: The cloud account to run the job on.
|
|
69
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
70
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
71
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
72
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
73
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
74
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
75
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
76
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
77
|
+
Only supported for jobs with a docker image compute environment.
|
|
78
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
79
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
80
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
81
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
82
|
+
within it.
|
|
83
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
84
|
+
Only supported for jobs with a docker image compute environment.
|
|
85
|
+
"""
|
|
46
86
|
if studio is None:
|
|
47
87
|
raise ValueError("Studio is required for submitting jobs")
|
|
48
88
|
if image is not None or image_credentials is not None or cloud_account_auth:
|
|
@@ -80,18 +120,25 @@ class _MMTV1(_BaseMMT):
|
|
|
80
120
|
|
|
81
121
|
@property
|
|
82
122
|
def machines(self) -> Tuple["Work", ...]:
|
|
123
|
+
"""Returns the sub-jobs for each individual instance."""
|
|
83
124
|
works = self._job_api.list_works(self._guaranteed_job.id, self.teamspace.id)
|
|
84
125
|
|
|
85
126
|
return tuple(Work(w.id, self, self.teamspace) for w in works)
|
|
86
127
|
|
|
87
128
|
def stop(self) -> None:
|
|
129
|
+
"""Stops the job."""
|
|
88
130
|
self._job_api.stop_job(self._guaranteed_job.id, self.teamspace.id)
|
|
89
131
|
|
|
90
132
|
def delete(self) -> None:
|
|
133
|
+
"""Deletes the job.
|
|
134
|
+
|
|
135
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
136
|
+
"""
|
|
91
137
|
self._job_api.delete_job(self._guaranteed_job.id, self.teamspace.id)
|
|
92
138
|
|
|
93
139
|
@property
|
|
94
140
|
def status(self) -> "Status":
|
|
141
|
+
"""The current status of the job."""
|
|
95
142
|
try:
|
|
96
143
|
status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
|
|
97
144
|
return _internal_status_to_external_status(status)
|
|
@@ -102,24 +149,33 @@ class _MMTV1(_BaseMMT):
|
|
|
102
149
|
|
|
103
150
|
@property
|
|
104
151
|
def artifact_path(self) -> Optional[str]:
|
|
152
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
105
153
|
return f"/teamspace/jobs/{self.name}"
|
|
106
154
|
|
|
107
155
|
@property
|
|
108
156
|
def snapshot_path(self) -> Optional[str]:
|
|
157
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
109
158
|
return f"/teamspace/jobs/{self.name}/snapshot"
|
|
110
159
|
|
|
111
160
|
@property
|
|
112
161
|
def machine(self) -> "Machine":
|
|
162
|
+
"""Returns the machine type this job is running on."""
|
|
113
163
|
return self.machines[0].machine
|
|
114
164
|
|
|
115
165
|
@property
|
|
116
166
|
def name(self) -> str:
|
|
167
|
+
"""The job's name."""
|
|
117
168
|
return self._name
|
|
118
169
|
|
|
119
170
|
@property
|
|
120
171
|
def teamspace(self) -> "Teamspace":
|
|
172
|
+
"""The teamspace the job is part of."""
|
|
121
173
|
return self._teamspace
|
|
122
174
|
|
|
175
|
+
@property
|
|
176
|
+
def link(self) -> str:
|
|
177
|
+
return f"https://lightning.ai/{self.teamspace.owner.name}/{self.teamspace.name}/studios/{self._job_api.get_studio_name(self._guaranteed_job)}/app?app_id=mmt&app_tab=Runs&job_name={self.name}"
|
|
178
|
+
|
|
123
179
|
# the following and functions are solely to make the Work class function
|
|
124
180
|
@property
|
|
125
181
|
def _id(self) -> str:
|