lightning-sdk 0.1.42__py3-none-any.whl → 0.1.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,6 +58,7 @@ class V1ProjectMembership(object):
58
58
  'last_name': 'str',
59
59
  'membership_count': 'str',
60
60
  'name': 'str',
61
+ 'next_free_credits_grant_at': 'datetime',
61
62
  'organization': 'str',
62
63
  'owner_id': 'str',
63
64
  'owner_type': 'V1OwnerType',
@@ -87,6 +88,7 @@ class V1ProjectMembership(object):
87
88
  'last_name': 'lastName',
88
89
  'membership_count': 'membershipCount',
89
90
  'name': 'name',
91
+ 'next_free_credits_grant_at': 'nextFreeCreditsGrantAt',
90
92
  'organization': 'organization',
91
93
  'owner_id': 'ownerId',
92
94
  'owner_type': 'ownerType',
@@ -98,7 +100,7 @@ class V1ProjectMembership(object):
98
100
  'username': 'username'
99
101
  }
100
102
 
101
- def __init__(self, avatar_url: 'str' =None, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, email: 'str' =None, first_name: 'str' =None, free_credits_enabled: 'bool' =None, inactive: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, job_title: 'str' =None, last_name: 'str' =None, membership_count: 'str' =None, name: 'str' =None, organization: 'str' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None, username: 'str' =None): # noqa: E501
103
+ def __init__(self, avatar_url: 'str' =None, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, email: 'str' =None, first_name: 'str' =None, free_credits_enabled: 'bool' =None, inactive: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, job_title: 'str' =None, last_name: 'str' =None, membership_count: 'str' =None, name: 'str' =None, next_free_credits_grant_at: 'datetime' =None, organization: 'str' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None, username: 'str' =None): # noqa: E501
102
104
  """V1ProjectMembership - a model defined in Swagger""" # noqa: E501
103
105
  self._avatar_url = None
104
106
  self._balance = None
@@ -117,6 +119,7 @@ class V1ProjectMembership(object):
117
119
  self._last_name = None
118
120
  self._membership_count = None
119
121
  self._name = None
122
+ self._next_free_credits_grant_at = None
120
123
  self._organization = None
121
124
  self._owner_id = None
122
125
  self._owner_type = None
@@ -161,6 +164,8 @@ class V1ProjectMembership(object):
161
164
  self.membership_count = membership_count
162
165
  if name is not None:
163
166
  self.name = name
167
+ if next_free_credits_grant_at is not None:
168
+ self.next_free_credits_grant_at = next_free_credits_grant_at
164
169
  if organization is not None:
165
170
  self.organization = organization
166
171
  if owner_id is not None:
@@ -537,6 +542,27 @@ class V1ProjectMembership(object):
537
542
 
538
543
  self._name = name
539
544
 
545
+ @property
546
+ def next_free_credits_grant_at(self) -> 'datetime':
547
+ """Gets the next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
548
+
549
+
550
+ :return: The next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
551
+ :rtype: datetime
552
+ """
553
+ return self._next_free_credits_grant_at
554
+
555
+ @next_free_credits_grant_at.setter
556
+ def next_free_credits_grant_at(self, next_free_credits_grant_at: 'datetime'):
557
+ """Sets the next_free_credits_grant_at of this V1ProjectMembership.
558
+
559
+
560
+ :param next_free_credits_grant_at: The next_free_credits_grant_at of this V1ProjectMembership. # noqa: E501
561
+ :type: datetime
562
+ """
563
+
564
+ self._next_free_credits_grant_at = next_free_credits_grant_at
565
+
540
566
  @property
541
567
  def organization(self) -> 'str':
542
568
  """Gets the organization of this V1ProjectMembership. # noqa: E501
@@ -1,3 +1,4 @@
1
+ from lightning_sdk.mmt.base import MMTMachine
1
2
  from lightning_sdk.mmt.mmt import MMT
2
3
 
3
- __all__ = ["MMT"]
4
+ __all__ = ["MMT", "MMTMachine"]
lightning_sdk/mmt/base.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from abc import abstractmethod
2
- from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
2
+ from typing import TYPE_CHECKING, Dict, Optional, Protocol, Tuple, Union
3
3
 
4
4
  if TYPE_CHECKING:
5
5
  from lightning_sdk.machine import Machine
@@ -10,11 +10,41 @@ if TYPE_CHECKING:
10
10
  from lightning_sdk.user import User
11
11
 
12
12
  from lightning_sdk.job.base import _BaseJob
13
- from lightning_sdk.job.job import Job
14
13
  from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
15
14
 
16
15
 
16
+ class MMTMachine(Protocol):
17
+ """A single machine in multi-machine training."""
18
+
19
+ @property
20
+ def name(self) -> str:
21
+ """The name of the individual machine. Usually corresponds to the rank."""
22
+ ...
23
+
24
+ @property
25
+ def machine(self) -> "Machine":
26
+ """The actual machine type this node is running on."""
27
+ ...
28
+
29
+ @property
30
+ def artifact_path(self) -> Optional[str]:
31
+ """The path to the artifacts of this job."""
32
+ ...
33
+
34
+ @property
35
+ def status(self) -> "Status":
36
+ """The status of this job."""
37
+ ...
38
+
39
+ @property
40
+ def logs(self) -> str:
41
+ """The logs of the given machine."""
42
+ ...
43
+
44
+
17
45
  class _BaseMMT(_BaseJob):
46
+ """Base interface to all multi-machine training (MMT) job types."""
47
+
18
48
  @classmethod
19
49
  def run(
20
50
  cls,
@@ -36,6 +66,39 @@ class _BaseMMT(_BaseJob):
36
66
  artifacts_remote: Optional[str] = None,
37
67
  cluster: Optional[str] = None, # deprecated in favor of cloud_account
38
68
  ) -> "_BaseMMT":
69
+ """Run async workloads using a docker image across multiple machines.
70
+
71
+ Args:
72
+ name: The name of the job. Needs to be unique within the teamspace.
73
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
74
+ num_machines: The number of machines to run on.
75
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
76
+ If not provided for images, will run the container entrypoint and default command.
77
+ studio: The studio env to run the job with. Mutually exclusive with image.
78
+ image: The docker image to run the job with. Mutually exclusive with studio.
79
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
80
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
81
+ user: The user owning the teamspace (if any). Defaults to the current user.
82
+ cloud_account: The cloud account to run the job on.
83
+ Defaults to the studio cloud account if running with studio compute env.
84
+ If not provided will fall back to the teamspaces default cloud account.
85
+ env: Environment variables to set inside the job.
86
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
87
+ image_credentials: The credentials used to pull the image. Required if the image is private.
88
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
89
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
90
+ Required if the registry is part of a cloud provider (e.g. ECR).
91
+ artifacts_local: The path inside the docker container that you want to persist images from.
92
+ CAUTION: When setting this to "/", it will effectively erase your container.
93
+ Only supported for jobs with a docker image compute environment.
94
+ artifacts_remote: The remote storage to persist your artifacts to.
95
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
96
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
97
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
98
+ within it.
99
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
100
+ Only supported for jobs with a docker image compute environment.
101
+ """
39
102
  from lightning_sdk.studio import Studio
40
103
 
41
104
  cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
@@ -134,53 +197,92 @@ class _BaseMMT(_BaseJob):
134
197
  artifacts_local: Optional[str] = None,
135
198
  artifacts_remote: Optional[str] = None,
136
199
  ) -> None:
137
- """Submits a job and updates the internal _job attribute as well as the _name attribute."""
200
+ """Submit a new multi-machine job to the Lightning AI platform.
201
+
202
+ Args:
203
+ num_machines: The number of machines to run on.
204
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
205
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
206
+ If not provided for images, will run the container entrypoint and default command.
207
+ studio: The studio env to run the job with. Mutually exclusive with image.
208
+ image: The docker image to run the job with. Mutually exclusive with studio.
209
+ env: Environment variables to set inside the job.
210
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
211
+ cloud_account: The cloud account to run the job on.
212
+ Defaults to the studio cloud account if running with studio compute env.
213
+ If not provided will fall back to the teamspaces default cloud account.
214
+ image_credentials: The credentials used to pull the image. Required if the image is private.
215
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
216
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
217
+ Required if the registry is part of a cloud provider (e.g. ECR).
218
+ artifacts_local: The path inside the docker container that you want to persist images from.
219
+ CAUTION: When setting this to "/", it will effectively erase your container.
220
+ Only supported for jobs with a docker image compute environment.
221
+ artifacts_remote: The remote storage to persist your artifacts to.
222
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
223
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
224
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
225
+ within it.
226
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
227
+ Only supported for jobs with a docker image compute environment.
228
+ """
138
229
 
139
230
  @property
140
231
  @abstractmethod
141
- def machines(self) -> Tuple["Job", ...]:
142
- pass
232
+ def machines(self) -> Tuple[MMTMachine, ...]:
233
+ """Returns the sub-jobs for each individual instance."""
143
234
 
144
235
  @property
145
236
  @abstractmethod
146
237
  def machine(self) -> "Machine":
147
- pass
238
+ """Returns the machine type this job is running on."""
148
239
 
149
240
  @abstractmethod
150
241
  def stop(self) -> None:
151
- pass
242
+ """Stops the job."""
152
243
 
153
244
  @abstractmethod
154
245
  def delete(self) -> None:
155
- pass
246
+ """Deletes the job.
247
+
248
+ Caution: This also deletes all artifacts and snapshots associated with the job.
249
+ """
156
250
 
157
251
  @property
158
252
  @abstractmethod
159
253
  def status(self) -> "Status":
160
- pass
254
+ """The current status of the job."""
161
255
 
162
256
  @property
163
257
  @abstractmethod
164
258
  def artifact_path(self) -> Optional[str]:
165
- pass
259
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
166
260
 
167
261
  @property
168
262
  @abstractmethod
169
263
  def snapshot_path(self) -> Optional[str]:
170
- pass
264
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
171
265
 
172
266
  @property
173
267
  def share_path(self) -> Optional[str]:
268
+ """Path to the job's share path."""
174
269
  return None
175
270
 
176
- @abstractmethod
177
- def _update_internal_job(self) -> None:
178
- pass
179
-
180
271
  @property
181
272
  def name(self) -> str:
273
+ """The job's name."""
182
274
  return self._name
183
275
 
184
276
  @property
185
277
  def teamspace(self) -> "Teamspace":
278
+ """The teamspace the job is part of."""
186
279
  return self._teamspace
280
+
281
+ @property
282
+ def logs(self) -> str:
283
+ """Logs of the rank 0 machine."""
284
+ return self.machines[0].logs
285
+
286
+ @abstractmethod
287
+ def _update_internal_job(self) -> None:
288
+ pass
lightning_sdk/mmt/mmt.py CHANGED
@@ -1,9 +1,13 @@
1
- from typing import TYPE_CHECKING, Any, Dict, Optional, Protocol, Tuple, Union
1
+ from contextlib import suppress
2
+ from functools import lru_cache
3
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
2
4
 
5
+ from lightning_sdk.api.user_api import UserApi
3
6
  from lightning_sdk.job.job import _has_jobs_v2
4
- from lightning_sdk.mmt.base import _BaseMMT
7
+ from lightning_sdk.mmt.base import MMTMachine, _BaseMMT
5
8
  from lightning_sdk.mmt.v1 import _MMTV1
6
9
  from lightning_sdk.mmt.v2 import _MMTV2
10
+ from lightning_sdk.utils.resolve import _setup_logger
7
11
 
8
12
  if TYPE_CHECKING:
9
13
  from lightning_sdk.machine import Machine
@@ -13,28 +17,25 @@ if TYPE_CHECKING:
13
17
  from lightning_sdk.teamspace import Teamspace
14
18
  from lightning_sdk.user import User
15
19
 
20
+ _logger = _setup_logger(__name__)
16
21
 
17
- class MMTMachine(Protocol):
18
- """A single machine in multi-machine training."""
19
22
 
20
- @property
21
- def name(self) -> str:
22
- ...
23
-
24
- @property
25
- def machine(self) -> "Machine":
26
- ...
23
+ @lru_cache(maxsize=None)
24
+ def _has_mmt_v2() -> bool:
25
+ # users need both mmtv2 and jobsv2 flags in order for mmtv2 to work correctly
26
+ if not _has_jobs_v2():
27
+ return False
27
28
 
28
- @property
29
- def artifact_path(self) -> Optional[str]:
30
- ...
31
-
32
- @property
33
- def status(self) -> "Status":
34
- ...
29
+ api = UserApi()
30
+ try:
31
+ return api._get_feature_flags().mmt_v2
32
+ except Exception:
33
+ return False
35
34
 
36
35
 
37
36
  class MMT(_BaseMMT):
37
+ """Class to submit and manage multi-machine jobs on the Lightning AI Platform."""
38
+
38
39
  _force_v1: (
39
40
  bool
40
41
  ) = False # required for studio plugin still working correctly as v2 currently does not support the studio env
@@ -48,7 +49,16 @@ class MMT(_BaseMMT):
48
49
  *,
49
50
  _fetch_job: bool = True,
50
51
  ) -> None:
51
- internal_mmt_cls = _MMTV2 if _has_jobs_v2() and not self._force_v1 else _MMTV1
52
+ """Fetch already existing jobs.
53
+
54
+ Args:
55
+ name: the name of the job
56
+ teamspace: the teamspace the job is part of
57
+ org: the name of the organization owning the `teamspace` in case it is owned by an org
58
+ user: the name of the user owning the `teamspace`
59
+ in case it is owned directly by a user instead of an org.
60
+ """
61
+ internal_mmt_cls = _MMTV2 if _has_mmt_v2() and not self._force_v1 else _MMTV1
52
62
 
53
63
  self._internal_mmt = internal_mmt_cls(
54
64
  name=name,
@@ -79,6 +89,39 @@ class MMT(_BaseMMT):
79
89
  artifacts_remote: Optional[str] = None,
80
90
  cluster: Optional[str] = None, # deprecated in favor of cloud_account
81
91
  ) -> "MMT":
92
+ """Run async workloads using a docker image across multiple machines.
93
+
94
+ Args:
95
+ name: The name of the job. Needs to be unique within the teamspace.
96
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
97
+ num_machines: The number of machines to run on.
98
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
99
+ If not provided for images, will run the container entrypoint and default command.
100
+ studio: The studio env to run the job with. Mutually exclusive with image.
101
+ image: The docker image to run the job with. Mutually exclusive with studio.
102
+ teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
103
+ org: The organization owning the teamspace (if any). Defaults to the current organization.
104
+ user: The user owning the teamspace (if any). Defaults to the current user.
105
+ cloud_account: The cloud account to run the job on.
106
+ Defaults to the studio cloud account if running with studio compute env.
107
+ If not provided will fall back to the teamspaces default cloud account.
108
+ env: Environment variables to set inside the job.
109
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
110
+ image_credentials: The credentials used to pull the image. Required if the image is private.
111
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
112
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
113
+ Required if the registry is part of a cloud provider (e.g. ECR).
114
+ artifacts_local: The path inside the docker container that you want to persist images from.
115
+ CAUTION: When setting this to "/", it will effectively erase your container.
116
+ Only supported for jobs with a docker image compute environment.
117
+ artifacts_remote: The remote storage to persist your artifacts to.
118
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
119
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
120
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
121
+ within it.
122
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
123
+ Only supported for jobs with a docker image compute environment.
124
+ """
82
125
  ret_val = super().run(
83
126
  name=name,
84
127
  num_machines=num_machines,
@@ -98,8 +141,15 @@ class MMT(_BaseMMT):
98
141
  artifacts_remote=artifacts_remote,
99
142
  cluster=cluster, # deprecated in favor of cloud_account
100
143
  )
101
- # required for typing with "Job"
144
+ # required for typing with "MMT"
102
145
  assert isinstance(ret_val, cls)
146
+
147
+ msg = "Multi-Machine Job was successfully launched."
148
+
149
+ with suppress(NotImplementedError):
150
+ msg += f" View it at {ret_val.link}"
151
+
152
+ _logger.info(msg)
103
153
  return ret_val
104
154
 
105
155
  def _submit(
@@ -117,6 +167,35 @@ class MMT(_BaseMMT):
117
167
  artifacts_local: Optional[str] = None,
118
168
  artifacts_remote: Optional[str] = None,
119
169
  ) -> "MMT":
170
+ """Submit a new multi-machine job to the Lightning AI platform.
171
+
172
+ Args:
173
+ num_machines: The number of machines to run on.
174
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
175
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
176
+ If not provided for images, will run the container entrypoint and default command.
177
+ studio: The studio env to run the job with. Mutually exclusive with image.
178
+ image: The docker image to run the job with. Mutually exclusive with studio.
179
+ env: Environment variables to set inside the job.
180
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
181
+ cloud_account: The cloud account to run the job on.
182
+ Defaults to the studio cloud account if running with studio compute env.
183
+ If not provided will fall back to the teamspaces default cloud account.
184
+ image_credentials: The credentials used to pull the image. Required if the image is private.
185
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
186
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
187
+ Required if the registry is part of a cloud provider (e.g. ECR).
188
+ artifacts_local: The path inside the docker container that you want to persist images from.
189
+ CAUTION: When setting this to "/", it will effectively erase your container.
190
+ Only supported for jobs with a docker image compute environment.
191
+ artifacts_remote: The remote storage to persist your artifacts to.
192
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
193
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
194
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
195
+ within it.
196
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
197
+ Only supported for jobs with a docker image compute environment.
198
+ """
120
199
  self._job = self._internal_mmt._submit(
121
200
  num_machines=num_machines,
122
201
  machine=machine,
@@ -134,33 +213,44 @@ class MMT(_BaseMMT):
134
213
  return self
135
214
 
136
215
  def stop(self) -> None:
216
+ """Stops the job."""
137
217
  return self._internal_mmt.stop()
138
218
 
139
219
  def delete(self) -> None:
220
+ """Deletes the job.
221
+
222
+ Caution: This also deletes all artifacts and snapshots associated with the job.
223
+ """
140
224
  return self._internal_mmt.delete()
141
225
 
142
226
  @property
143
227
  def status(self) -> "Status":
228
+ """The current status of the job (accumulated over all machines)."""
144
229
  return self._internal_mmt.status
145
230
 
146
231
  @property
147
232
  def machines(self) -> Tuple[MMTMachine, ...]:
233
+ """Returns the sub-jobs for each individual instance."""
148
234
  return self._internal_mmt.machines
149
235
 
150
236
  @property
151
237
  def machine(self) -> "Machine":
238
+ """Returns the machine type this job is running on."""
152
239
  return self._internal_mmt.machine
153
240
 
154
241
  @property
155
242
  def artifact_path(self) -> Optional[str]:
243
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
156
244
  return self._internal_mmt.artifact_path
157
245
 
158
246
  @property
159
247
  def snapshot_path(self) -> Optional[str]:
248
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
160
249
  return self._internal_mmt.snapshot_path
161
250
 
162
251
  @property
163
252
  def share_path(self) -> Optional[str]:
253
+ """Path to the job's share path."""
164
254
  return None
165
255
 
166
256
  def _update_internal_job(self) -> None:
@@ -168,15 +258,17 @@ class MMT(_BaseMMT):
168
258
 
169
259
  @property
170
260
  def name(self) -> str:
261
+ """The job's name."""
171
262
  return self._internal_mmt.name
172
263
 
173
264
  @property
174
265
  def teamspace(self) -> "Teamspace":
266
+ """The teamspace the job is part of."""
175
267
  return self._internal_mmt._teamspace
176
268
 
177
269
  @property
178
- def cloud_account(self) -> Optional[str]:
179
- return self._internal_mmt.cloud_account
270
+ def link(self) -> str:
271
+ return self._internal_mmt.link
180
272
 
181
273
  def __getattr__(self, key: str) -> Any:
182
274
  """Forward the attribute lookup to the internal job implementation."""
lightning_sdk/mmt/v1.py CHANGED
@@ -16,6 +16,8 @@ from lightning_sdk.mmt.base import _BaseMMT
16
16
 
17
17
 
18
18
  class _MMTV1(_BaseMMT):
19
+ """V1 Implementation of Multi-Machine Training."""
20
+
19
21
  def __init__(
20
22
  self,
21
23
  name: str,
@@ -25,6 +27,15 @@ class _MMTV1(_BaseMMT):
25
27
  *,
26
28
  _fetch_job: bool = True,
27
29
  ) -> None:
30
+ """Fetch already existing jobs.
31
+
32
+ Args:
33
+ name: the name of the job
34
+ teamspace: the teamspace the job is part of
35
+ org: the name of the organization owning the `teamspace` in case it is owned by an org
36
+ user: the name of the user owning the `teamspace`
37
+ in case it is owned directly by a user instead of an org.
38
+ """
28
39
  self._job_api = MMTApiV1()
29
40
  super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
30
41
 
@@ -43,6 +54,35 @@ class _MMTV1(_BaseMMT):
43
54
  artifacts_local: Optional[str] = None,
44
55
  artifacts_remote: Optional[str] = None,
45
56
  ) -> "_MMTV1":
57
+ """Submit a new multi-machine job to the Lightning AI platform.
58
+
59
+ Args:
60
+ num_machines: The number of machines to run on.
61
+ machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
62
+ command: The command to run inside your job. Required if using a studio. Optional if using an image.
63
+ If not provided for images, will run the container entrypoint and default command.
64
+ studio: The studio env to run the job with. Mutually exclusive with image.
65
+ image: The docker image to run the job with. Mutually exclusive with studio.
66
+ env: Environment variables to set inside the job.
67
+ interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
68
+ cloud_account: The cloud account to run the job on.
69
+ Defaults to the studio cloud account if running with studio compute env.
70
+ If not provided will fall back to the teamspaces default cloud account.
71
+ image_credentials: The credentials used to pull the image. Required if the image is private.
72
+ This should be the name of the respective credentials secret created on the Lightning AI platform.
73
+ cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
74
+ Required if the registry is part of a cloud provider (e.g. ECR).
75
+ artifacts_local: The path inside the docker container that you want to persist images from.
76
+ CAUTION: When setting this to "/", it will effectively erase your container.
77
+ Only supported for jobs with a docker image compute environment.
78
+ artifacts_remote: The remote storage to persist your artifacts to.
79
+ Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
80
+ PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
81
+ E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
82
+ within it.
83
+ Note that the connection needs to be added to the teamspace already in order for it to be found.
84
+ Only supported for jobs with a docker image compute environment.
85
+ """
46
86
  if studio is None:
47
87
  raise ValueError("Studio is required for submitting jobs")
48
88
  if image is not None or image_credentials is not None or cloud_account_auth:
@@ -80,18 +120,25 @@ class _MMTV1(_BaseMMT):
80
120
 
81
121
  @property
82
122
  def machines(self) -> Tuple["Work", ...]:
123
+ """Returns the sub-jobs for each individual instance."""
83
124
  works = self._job_api.list_works(self._guaranteed_job.id, self.teamspace.id)
84
125
 
85
126
  return tuple(Work(w.id, self, self.teamspace) for w in works)
86
127
 
87
128
  def stop(self) -> None:
129
+ """Stops the job."""
88
130
  self._job_api.stop_job(self._guaranteed_job.id, self.teamspace.id)
89
131
 
90
132
  def delete(self) -> None:
133
+ """Deletes the job.
134
+
135
+ Caution: This also deletes all artifacts and snapshots associated with the job.
136
+ """
91
137
  self._job_api.delete_job(self._guaranteed_job.id, self.teamspace.id)
92
138
 
93
139
  @property
94
140
  def status(self) -> "Status":
141
+ """The current status of the job."""
95
142
  try:
96
143
  status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
97
144
  return _internal_status_to_external_status(status)
@@ -102,24 +149,33 @@ class _MMTV1(_BaseMMT):
102
149
 
103
150
  @property
104
151
  def artifact_path(self) -> Optional[str]:
152
+ """Path to the artifacts created by the job within the distributed teamspace filesystem."""
105
153
  return f"/teamspace/jobs/{self.name}"
106
154
 
107
155
  @property
108
156
  def snapshot_path(self) -> Optional[str]:
157
+ """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
109
158
  return f"/teamspace/jobs/{self.name}/snapshot"
110
159
 
111
160
  @property
112
161
  def machine(self) -> "Machine":
162
+ """Returns the machine type this job is running on."""
113
163
  return self.machines[0].machine
114
164
 
115
165
  @property
116
166
  def name(self) -> str:
167
+ """The job's name."""
117
168
  return self._name
118
169
 
119
170
  @property
120
171
  def teamspace(self) -> "Teamspace":
172
+ """The teamspace the job is part of."""
121
173
  return self._teamspace
122
174
 
175
+ @property
176
+ def link(self) -> str:
177
+ return f"https://lightning.ai/{self.teamspace.owner.name}/{self.teamspace.name}/studios/{self._job_api.get_studio_name(self._guaranteed_job)}/app?app_id=mmt&app_tab=Runs&job_name={self.name}"
178
+
123
179
  # the following properties and functions exist solely to make the Work class function
124
180
  @property
125
181
  def _id(self) -> str: