lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. lightning_sdk/__init__.py +1 -1
  2. lightning_sdk/ai_hub.py +8 -3
  3. lightning_sdk/api/ai_hub_api.py +3 -3
  4. lightning_sdk/api/deployment_api.py +6 -6
  5. lightning_sdk/api/job_api.py +32 -6
  6. lightning_sdk/api/mmt_api.py +59 -19
  7. lightning_sdk/api/studio_api.py +37 -19
  8. lightning_sdk/api/teamspace_api.py +34 -29
  9. lightning_sdk/api/utils.py +46 -34
  10. lightning_sdk/cli/ai_hub.py +3 -3
  11. lightning_sdk/cli/entrypoint.py +3 -1
  12. lightning_sdk/cli/run.py +122 -12
  13. lightning_sdk/cli/serve.py +218 -0
  14. lightning_sdk/deployment/deployment.py +18 -12
  15. lightning_sdk/job/base.py +118 -24
  16. lightning_sdk/job/job.py +98 -9
  17. lightning_sdk/job/v1.py +75 -18
  18. lightning_sdk/job/v2.py +51 -15
  19. lightning_sdk/job/work.py +36 -7
  20. lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
  21. lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
  22. lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
  23. lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
  24. lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
  25. lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
  26. lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
  27. lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
  28. lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
  29. lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
  30. lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
  31. lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
  32. lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
  33. lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
  34. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
  35. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
  36. lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
  37. lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
  38. lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
  39. lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
  40. lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
  41. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
  42. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
  43. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
  44. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
  45. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
  46. lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
  47. lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
  48. lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
  49. lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
  50. lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
  51. lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
  52. lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
  53. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
  54. lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
  55. lightning_sdk/lightning_cloud/rest_client.py +2 -0
  56. lightning_sdk/mmt/__init__.py +4 -0
  57. lightning_sdk/mmt/base.py +278 -0 (new file; full diff shown below)
  58. lightning_sdk/mmt/mmt.py +267 -0 (new file; full diff shown below)
  59. lightning_sdk/mmt/v1.py +181 -0
  60. lightning_sdk/mmt/v2.py +188 -0
  61. lightning_sdk/plugin.py +43 -16
  62. lightning_sdk/services/file_endpoint.py +11 -5
  63. lightning_sdk/studio.py +16 -9
  64. lightning_sdk/teamspace.py +21 -8
  65. lightning_sdk/utils/resolve.py +18 -0
  66. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
  67. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
  68. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
  69. lightning_sdk/_mmt/__init__.py +0 -3
  70. lightning_sdk/_mmt/base.py +0 -180
  71. lightning_sdk/_mmt/mmt.py +0 -161
  72. lightning_sdk/_mmt/v1.py +0 -69
  73. lightning_sdk/_mmt/v2.py +0 -141
  74. lightning_sdk/cli/mmt.py +0 -137
  75. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
  76. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
  77. {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
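The most consequential structural change in the list above is that the private lightning_sdk._mmt package and the old lightning_sdk/cli/mmt.py module are removed, while a new public lightning_sdk.mmt package takes their place; its two central modules are diffed in full below. A minimal import-migration sketch in Python (the old private import path is an assumption based on the removed file names, and the re-exports of the new lightning_sdk/mmt/__init__.py are not shown in this diff, so the fully qualified module path is used):

# Sketch only; module paths are taken from the file list above, not from tested code.

# 0.1.41 -- assumed import from the private package that this release removes:
# from lightning_sdk._mmt.mmt import MMT

# 0.1.43 -- the public replacement, as defined in lightning_sdk/mmt/mmt.py (diffed below):
from lightning_sdk.mmt.mmt import MMT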
lightning_sdk/mmt/base.py (new file)
@@ -0,0 +1,278 @@
+ from abc import abstractmethod
+ from typing import TYPE_CHECKING, Dict, Optional, Protocol, Tuple, Union
+
+ if TYPE_CHECKING:
+     from lightning_sdk.machine import Machine
+     from lightning_sdk.organization import Organization
+     from lightning_sdk.status import Status
+     from lightning_sdk.studio import Studio
+     from lightning_sdk.teamspace import Teamspace
+     from lightning_sdk.user import User
+
+ from lightning_sdk.job.base import _BaseJob
+ from lightning_sdk.utils.resolve import _resolve_deprecated_cluster
+
+
+ class MMTMachine(Protocol):
+     """A single machine in multi-machine training."""
+
+     @property
+     def name(self) -> str:
+         """The name of the individual machine. Usually corresponds to the rank."""
+         ...
+
+     @property
+     def machine(self) -> "Machine":
+         """The actual machine type this node is running on."""
+         ...
+
+     @property
+     def artifact_path(self) -> Optional[str]:
+         """The path to the artifacts of this job."""
+         ...
+
+     @property
+     def status(self) -> "Status":
+         """The status of this job."""
+         ...
+
+
+ class _BaseMMT(_BaseJob):
+     """Base interface to all job types."""
+
+     @classmethod
+     def run(
+         cls,
+         name: str,
+         machine: "Machine",
+         num_machines: int,
+         command: Optional[str] = None,
+         studio: Union["Studio", str, None] = None,
+         image: Optional[str] = None,
+         teamspace: Union[str, "Teamspace", None] = None,
+         org: Union[str, "Organization", None] = None,
+         user: Union[str, "User", None] = None,
+         cloud_account: Optional[str] = None,
+         env: Optional[Dict[str, str]] = None,
+         interruptible: bool = False,
+         image_credentials: Optional[str] = None,
+         cloud_account_auth: bool = False,
+         artifacts_local: Optional[str] = None,
+         artifacts_remote: Optional[str] = None,
+         cluster: Optional[str] = None,  # deprecated in favor of cloud_account
+     ) -> "_BaseMMT":
+         """Run async workloads using a docker image across multiple machines.
+
+         Args:
+             name: The name of the job. Needs to be unique within the teamspace.
+             machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
+             num_machines: The number of machines to run on.
+             command: The command to run inside your job. Required if using a studio. Optional if using an image.
+                 If not provided for images, will run the container entrypoint and default command.
+             studio: The studio env to run the job with. Mutually exclusive with image.
+             image: The docker image to run the job with. Mutually exclusive with studio.
+             teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
+             org: The organization owning the teamspace (if any). Defaults to the current organization.
+             user: The user owning the teamspace (if any). Defaults to the current user.
+             cloud_account: The cloud account to run the job on.
+                 Defaults to the studio cloud account if running with studio compute env.
+                 If not provided, will fall back to the teamspace's default cloud account.
+             env: Environment variables to set inside the job.
+             interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
+             image_credentials: The credentials used to pull the image. Required if the image is private.
+                 This should be the name of the respective credentials secret created on the Lightning AI platform.
+             cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
+                 Required if the registry is part of a cloud provider (e.g. ECR).
+             artifacts_local: The path inside the docker container you want to persist artifacts from.
+                 CAUTION: When setting this to "/", it will effectively erase your container.
+                 Only supported for jobs with a docker image compute environment.
+             artifacts_remote: The remote storage to persist your artifacts to.
+                 Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
+                 PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
+                 E.g. efs:data:some-path would result in an EFS connection named `data` and the path `some-path`
+                 within it.
+                 Note that the connection needs to be added to the teamspace already in order for it to be found.
+                 Only supported for jobs with a docker image compute environment.
+         """
+         from lightning_sdk.studio import Studio
+
+         cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
+
+         if num_machines <= 1:
+             raise ValueError("Multi-Machine training cannot be run with less than 2 Machines")
+
+         if not name:
+             raise ValueError("A job needs to have a name!")
+
+         if image is None:
+             if not isinstance(studio, Studio):
+                 studio = Studio(
+                     name=studio, teamspace=teamspace, org=org, user=user, cloud_account=cloud_account, create_ok=False
+                 )
+
+             # studio is a Studio instance at this point
+             if teamspace is None:
+                 teamspace = studio.teamspace
+             else:
+                 teamspace_name = teamspace if isinstance(teamspace, str) else teamspace.name
+
+                 if studio.teamspace.name != teamspace_name:
+                     raise ValueError(
+                         "Studio teamspace does not match provided teamspace. "
+                         "Can only run jobs with Studio envs in the teamspace of that Studio."
+                     )
+
+             if cloud_account is None:
+                 cloud_account = studio.cloud_account
+
+             if cloud_account != studio.cloud_account:
+                 raise ValueError(
+                     "Studio cloud_account does not match provided cloud_account. "
+                     "Can only run jobs with Studio envs in the same cloud_account."
+                 )
+
+             if image_credentials is not None:
+                 raise ValueError("image_credentials is only supported when using a custom image")
+
+             if cloud_account_auth:
+                 raise ValueError("cloud_account_auth is only supported when using a custom image")
+
+             if artifacts_local is not None or artifacts_remote is not None:
+                 raise ValueError(
+                     "Specifying artifacts persistence is supported for docker images only. "
+                     "Other jobs will automatically persist artifacts to the teamspace distributed filesystem."
+                 )
+
+         else:
+             if studio is not None:
+                 raise RuntimeError(
+                     "image and studio are mutually exclusive as both define the environment to run the job in"
+                 )
+
+             # either both or neither of them need to be specified
+             if bool(artifacts_local) != bool(artifacts_remote):
+                 raise ValueError("Artifact persistence requires both artifacts_local and artifacts_remote to be set")
+
+             if artifacts_remote and len(artifacts_remote.split(":")) != 3:
+                 raise ValueError(
+                     "Artifact persistence requires exactly three arguments separated by colon of kind "
+                     f"<CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>, got {artifacts_local}"
+                 )
+
+         inst = cls(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=False)
+         inst._submit(
+             num_machines=num_machines,
+             machine=machine,
+             cloud_account=cloud_account,
+             command=command,
+             studio=studio,
+             image=image,
+             env=env,
+             interruptible=interruptible,
+             image_credentials=image_credentials,
+             cloud_account_auth=cloud_account_auth,
+             artifacts_local=artifacts_local,
+             artifacts_remote=artifacts_remote,
+         )
+         return inst
+
+     @abstractmethod
+     def _submit(
+         self,
+         num_machines: int,
+         machine: "Machine",
+         command: Optional[str] = None,
+         studio: Optional["Studio"] = None,
+         image: Optional[str] = None,
+         env: Optional[Dict[str, str]] = None,
+         interruptible: bool = False,
+         cloud_account: Optional[str] = None,
+         image_credentials: Optional[str] = None,
+         cloud_account_auth: bool = False,
+         artifacts_local: Optional[str] = None,
+         artifacts_remote: Optional[str] = None,
+     ) -> None:
+         """Submit a new multi-machine job to the Lightning AI platform.
+
+         Args:
+             num_machines: The number of machines to run on.
+             machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
+             command: The command to run inside your job. Required if using a studio. Optional if using an image.
+                 If not provided for images, will run the container entrypoint and default command.
+             studio: The studio env to run the job with. Mutually exclusive with image.
+             image: The docker image to run the job with. Mutually exclusive with studio.
+             env: Environment variables to set inside the job.
+             interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
+             cloud_account: The cloud account to run the job on.
+                 Defaults to the studio cloud account if running with studio compute env.
+                 If not provided, will fall back to the teamspace's default cloud account.
+             image_credentials: The credentials used to pull the image. Required if the image is private.
+                 This should be the name of the respective credentials secret created on the Lightning AI platform.
+             cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
+                 Required if the registry is part of a cloud provider (e.g. ECR).
+             artifacts_local: The path inside the docker container you want to persist artifacts from.
+                 CAUTION: When setting this to "/", it will effectively erase your container.
+                 Only supported for jobs with a docker image compute environment.
+             artifacts_remote: The remote storage to persist your artifacts to.
+                 Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
+                 PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
+                 E.g. efs:data:some-path would result in an EFS connection named `data` and the path `some-path`
+                 within it.
+                 Note that the connection needs to be added to the teamspace already in order for it to be found.
+                 Only supported for jobs with a docker image compute environment.
+         """
+
+     @property
+     @abstractmethod
+     def machines(self) -> Tuple[MMTMachine, ...]:
+         """Returns the sub-jobs for each individual instance."""
+
+     @property
+     @abstractmethod
+     def machine(self) -> "Machine":
+         """Returns the machine type this job is running on."""
+
+     @abstractmethod
+     def stop(self) -> None:
+         """Stops the job."""
+
+     @abstractmethod
+     def delete(self) -> None:
+         """Deletes the job.
+
+         Caution: This also deletes all artifacts and snapshots associated with the job.
+         """
+
+     @property
+     @abstractmethod
+     def status(self) -> "Status":
+         """The current status of the job."""
+
+     @property
+     @abstractmethod
+     def artifact_path(self) -> Optional[str]:
+         """Path to the artifacts created by the job within the distributed teamspace filesystem."""
+
+     @property
+     @abstractmethod
+     def snapshot_path(self) -> Optional[str]:
+         """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
+
+     @property
+     def share_path(self) -> Optional[str]:
+         """The job's share path."""
+         return None
+
+     @property
+     def name(self) -> str:
+         """The job's name."""
+         return self._name
+
+     @property
+     def teamspace(self) -> "Teamspace":
+         """The teamspace the job is part of."""
+         return self._teamspace
+
+     @abstractmethod
+     def _update_internal_job(self) -> None:
+         pass
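The run() validation above pins down two easy-to-miss constraints for image-based jobs: artifacts_local and artifacts_remote must be given together, and artifacts_remote must consist of exactly three colon-separated fields. A small illustrative Python sketch of that convention (the concrete values are made up, not taken from the package):

# Illustrative only: mirrors the checks in _BaseMMT.run() above.
artifacts_remote = "efs:data:some-path"  # <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>
artifacts_local = "/checkpoints"         # hypothetical container path to persist artifacts from

# exactly three colon-separated fields, as enforced by run()
connection_type, connection_name, path_within_connection = artifacts_remote.split(":")
assert len(artifacts_remote.split(":")) == 3

# both sides must be set together (or not at all), also enforced by run()
assert bool(artifacts_local) == bool(artifacts_remote)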
lightning_sdk/mmt/mmt.py (new file)
@@ -0,0 +1,267 @@
+ from functools import lru_cache
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+
+ from lightning_sdk.api.user_api import UserApi
+ from lightning_sdk.job.job import _has_jobs_v2
+ from lightning_sdk.mmt.base import MMTMachine, _BaseMMT
+ from lightning_sdk.mmt.v1 import _MMTV1
+ from lightning_sdk.mmt.v2 import _MMTV2
+
+ if TYPE_CHECKING:
+     from lightning_sdk.machine import Machine
+     from lightning_sdk.organization import Organization
+     from lightning_sdk.status import Status
+     from lightning_sdk.studio import Studio
+     from lightning_sdk.teamspace import Teamspace
+     from lightning_sdk.user import User
+
+
+ @lru_cache(maxsize=None)
+ def _has_mmt_v2() -> bool:
+     # users need both the mmtv2 and jobsv2 feature flags in order for mmtv2 to work correctly
+     if not _has_jobs_v2():
+         return False
+
+     api = UserApi()
+     try:
+         return api._get_feature_flags().mmt_v2
+     except Exception:
+         return False
+
+
+ class MMT(_BaseMMT):
+     """Class to submit and manage multi-machine jobs on the Lightning AI Platform."""
+
+     _force_v1: (
+         bool
+     ) = False  # required for the studio plugin to keep working correctly, as v2 currently does not support the studio env
+
+     def __init__(
+         self,
+         name: str,
+         teamspace: Union[str, "Teamspace", None] = None,
+         org: Union[str, "Organization", None] = None,
+         user: Union[str, "User", None] = None,
+         *,
+         _fetch_job: bool = True,
+     ) -> None:
+         """Fetch already existing jobs.
+
+         Args:
+             name: the name of the job
+             teamspace: the teamspace the job is part of
+             org: the name of the organization owning the :param`teamspace` in case it is owned by an org
+             user: the name of the user owning the :param`teamspace`
+                 in case it is owned directly by a user instead of an org.
+         """
+         internal_mmt_cls = _MMTV2 if _has_mmt_v2() and not self._force_v1 else _MMTV1
+
+         self._internal_mmt = internal_mmt_cls(
+             name=name,
+             teamspace=teamspace,
+             org=org,
+             user=user,
+             _fetch_job=_fetch_job,
+         )
+
+     @classmethod
+     def run(
+         cls,
+         name: str,
+         num_machines: int,
+         machine: "Machine",
+         command: Optional[str] = None,
+         studio: Union["Studio", str, None] = None,
+         image: Union[str, None] = None,
+         teamspace: Union[str, "Teamspace", None] = None,
+         org: Union[str, "Organization", None] = None,
+         user: Union[str, "User", None] = None,
+         cloud_account: Optional[str] = None,
+         env: Optional[Dict[str, str]] = None,
+         interruptible: bool = False,
+         image_credentials: Optional[str] = None,
+         cloud_account_auth: bool = False,
+         artifacts_local: Optional[str] = None,
+         artifacts_remote: Optional[str] = None,
+         cluster: Optional[str] = None,  # deprecated in favor of cloud_account
+     ) -> "MMT":
+         """Run async workloads using a docker image across multiple machines.
+
+         Args:
+             name: The name of the job. Needs to be unique within the teamspace.
+             machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
+             num_machines: The number of machines to run on.
+             command: The command to run inside your job. Required if using a studio. Optional if using an image.
+                 If not provided for images, will run the container entrypoint and default command.
+             studio: The studio env to run the job with. Mutually exclusive with image.
+             image: The docker image to run the job with. Mutually exclusive with studio.
+             teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
+             org: The organization owning the teamspace (if any). Defaults to the current organization.
+             user: The user owning the teamspace (if any). Defaults to the current user.
+             cloud_account: The cloud account to run the job on.
+                 Defaults to the studio cloud account if running with studio compute env.
+                 If not provided, will fall back to the teamspace's default cloud account.
+             env: Environment variables to set inside the job.
+             interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
+             image_credentials: The credentials used to pull the image. Required if the image is private.
+                 This should be the name of the respective credentials secret created on the Lightning AI platform.
+             cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
+                 Required if the registry is part of a cloud provider (e.g. ECR).
+             artifacts_local: The path inside the docker container you want to persist artifacts from.
+                 CAUTION: When setting this to "/", it will effectively erase your container.
+                 Only supported for jobs with a docker image compute environment.
+             artifacts_remote: The remote storage to persist your artifacts to.
+                 Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
+                 PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
+                 E.g. efs:data:some-path would result in an EFS connection named `data` and the path `some-path`
+                 within it.
+                 Note that the connection needs to be added to the teamspace already in order for it to be found.
+                 Only supported for jobs with a docker image compute environment.
+         """
+         ret_val = super().run(
+             name=name,
+             num_machines=num_machines,
+             machine=machine,
+             command=command,
+             studio=studio,
+             image=image,
+             teamspace=teamspace,
+             org=org,
+             user=user,
+             cloud_account=cloud_account,
+             env=env,
+             interruptible=interruptible,
+             image_credentials=image_credentials,
+             cloud_account_auth=cloud_account_auth,
+             artifacts_local=artifacts_local,
+             artifacts_remote=artifacts_remote,
+             cluster=cluster,  # deprecated in favor of cloud_account
+         )
+         # required for typing with "Job"
+         assert isinstance(ret_val, cls)
+         return ret_val
+
+     def _submit(
+         self,
+         num_machines: int,
+         machine: "Machine",
+         command: Optional[str] = None,
+         studio: Optional["Studio"] = None,
+         image: Optional[str] = None,
+         env: Optional[Dict[str, str]] = None,
+         interruptible: bool = False,
+         cloud_account: Optional[str] = None,
+         image_credentials: Optional[str] = None,
+         cloud_account_auth: bool = False,
+         artifacts_local: Optional[str] = None,
+         artifacts_remote: Optional[str] = None,
+     ) -> "MMT":
+         """Submit a new multi-machine job to the Lightning AI platform.
+
+         Args:
+             num_machines: The number of machines to run on.
+             machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
+             command: The command to run inside your job. Required if using a studio. Optional if using an image.
+                 If not provided for images, will run the container entrypoint and default command.
+             studio: The studio env to run the job with. Mutually exclusive with image.
+             image: The docker image to run the job with. Mutually exclusive with studio.
+             env: Environment variables to set inside the job.
+             interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
+             cloud_account: The cloud account to run the job on.
+                 Defaults to the studio cloud account if running with studio compute env.
+                 If not provided, will fall back to the teamspace's default cloud account.
+             image_credentials: The credentials used to pull the image. Required if the image is private.
+                 This should be the name of the respective credentials secret created on the Lightning AI platform.
+             cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
+                 Required if the registry is part of a cloud provider (e.g. ECR).
+             artifacts_local: The path inside the docker container you want to persist artifacts from.
+                 CAUTION: When setting this to "/", it will effectively erase your container.
+                 Only supported for jobs with a docker image compute environment.
+             artifacts_remote: The remote storage to persist your artifacts to.
+                 Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
+                 PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
+                 E.g. efs:data:some-path would result in an EFS connection named `data` and the path `some-path`
+                 within it.
+                 Note that the connection needs to be added to the teamspace already in order for it to be found.
+                 Only supported for jobs with a docker image compute environment.
+         """
+         self._job = self._internal_mmt._submit(
+             num_machines=num_machines,
+             machine=machine,
+             cloud_account=cloud_account,
+             command=command,
+             studio=studio,
+             image=image,
+             env=env,
+             interruptible=interruptible,
+             image_credentials=image_credentials,
+             cloud_account_auth=cloud_account_auth,
+             artifacts_local=artifacts_local,
+             artifacts_remote=artifacts_remote,
+         )
+         return self
+
+     def stop(self) -> None:
+         """Stops the job."""
+         return self._internal_mmt.stop()
+
+     def delete(self) -> None:
+         """Deletes the job.
+
+         Caution: This also deletes all artifacts and snapshots associated with the job.
+         """
+         return self._internal_mmt.delete()
+
+     @property
+     def status(self) -> "Status":
+         """The current status of the job (accumulated over all machines)."""
+         return self._internal_mmt.status
+
+     @property
+     def machines(self) -> Tuple[MMTMachine, ...]:
+         """Returns the sub-jobs for each individual instance."""
+         return self._internal_mmt.machines
+
+     @property
+     def machine(self) -> "Machine":
+         """Returns the machine type this job is running on."""
+         return self._internal_mmt.machine
+
+     @property
+     def artifact_path(self) -> Optional[str]:
+         """Path to the artifacts created by the job within the distributed teamspace filesystem."""
+         return self._internal_mmt.artifact_path
+
+     @property
+     def snapshot_path(self) -> Optional[str]:
+         """Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
+         return self._internal_mmt.snapshot_path
+
+     @property
+     def share_path(self) -> Optional[str]:
+         """The job's share path."""
+         return None
+
+     def _update_internal_job(self) -> None:
+         return self._internal_mmt._update_internal_job()
+
+     @property
+     def name(self) -> str:
+         """The job's name."""
+         return self._internal_mmt.name
+
+     @property
+     def teamspace(self) -> "Teamspace":
+         """The teamspace the job is part of."""
+         return self._internal_mmt._teamspace
+
+     def __getattr__(self, key: str) -> Any:
+         """Forward the attribute lookup to the internal job implementation."""
+         try:
+             return getattr(super(), key)
+         except AttributeError:
+             return getattr(self._internal_mmt, key)
+
+     @property
+     def _guaranteed_job(self) -> Any:
+         return self._internal_mmt._guaranteed_job
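Taken together, MMT is the public facade: it dispatches to _MMTV2 when both the jobsv2 and mmtv2 feature flags are enabled and falls back to _MMTV1 otherwise, while the argument validation lives in _BaseMMT.run(). A hedged usage sketch based on the signatures in this diff; the job name, image, and machine value are illustrative, and Machine.CPU is assumed to be a valid member of the Machine enum:

# Usage sketch only; all concrete values below are hypothetical.
from lightning_sdk.machine import Machine
from lightning_sdk.mmt.mmt import MMT

mmt = MMT.run(
    name="my-distributed-job",               # must be unique within the teamspace
    num_machines=2,                          # run() rejects anything below 2
    machine=Machine.CPU,                     # assumed enum member; a GPU type would be typical for training
    image="ghcr.io/example/trainer:latest",  # hypothetical image; mutually exclusive with studio=
    command="python train.py",
    env={"MY_FLAG": "1"},
    artifacts_local="/checkpoints",                  # only valid for image-based jobs
    artifacts_remote="efs:data:my-distributed-job",  # <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH>
)

print(mmt.status)          # status accumulated over all machines
for node in mmt.machines:  # one MMTMachine per rank
    print(node.name, node.machine, node.artifact_path)
mmt.stop()

Because __getattr__ forwards unknown attributes to the internal _MMTV1/_MMTV2 instance, version-specific helpers stay reachable through the same MMT object.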