lightning-sdk 0.1.40__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/ai_hub.py +8 -3
- lightning_sdk/api/ai_hub_api.py +3 -3
- lightning_sdk/api/deployment_api.py +6 -6
- lightning_sdk/api/job_api.py +32 -6
- lightning_sdk/api/mmt_api.py +60 -19
- lightning_sdk/api/studio_api.py +37 -19
- lightning_sdk/api/teamspace_api.py +34 -29
- lightning_sdk/api/utils.py +48 -35
- lightning_sdk/cli/ai_hub.py +3 -3
- lightning_sdk/cli/entrypoint.py +3 -1
- lightning_sdk/cli/mmt.py +11 -10
- lightning_sdk/cli/run.py +9 -8
- lightning_sdk/cli/serve.py +130 -0
- lightning_sdk/deployment/deployment.py +18 -12
- lightning_sdk/job/base.py +118 -24
- lightning_sdk/job/job.py +87 -9
- lightning_sdk/job/v1.py +75 -18
- lightning_sdk/job/v2.py +51 -15
- lightning_sdk/job/work.py +36 -7
- lightning_sdk/lightning_cloud/openapi/__init__.py +13 -0
- lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
- lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
- lightning_sdk/lightning_cloud/openapi/api/secret_service_api.py +5 -1
- lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +13 -0
- lightning_sdk/lightning_cloud/openapi/models/create_deployment_request_defines_a_spec_for_the_job_that_allows_for_autoscaling_jobs.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
- lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/v1_deployment_api.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_deployment_spec.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_header.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_job_spec.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_managed_model.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +2 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_secret_type.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +41 -67
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
- lightning_sdk/lightning_cloud/rest_client.py +2 -0
- lightning_sdk/mmt/__init__.py +3 -0
- lightning_sdk/{_mmt → mmt}/base.py +20 -14
- lightning_sdk/{_mmt → mmt}/mmt.py +46 -17
- lightning_sdk/mmt/v1.py +129 -0
- lightning_sdk/{_mmt → mmt}/v2.py +16 -21
- lightning_sdk/plugin.py +43 -16
- lightning_sdk/services/file_endpoint.py +11 -5
- lightning_sdk/studio.py +16 -9
- lightning_sdk/teamspace.py +26 -14
- lightning_sdk/utils/resolve.py +18 -0
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/METADATA +3 -1
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/RECORD +80 -66
- lightning_sdk/_mmt/__init__.py +0 -3
- lightning_sdk/_mmt/v1.py +0 -69
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/entry_points.txt +0 -0
- {lightning_sdk-0.1.40.dist-info → lightning_sdk-0.1.42.dist-info}/top_level.txt +0 -0
lightning_sdk/job/base.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import TYPE_CHECKING, Dict, Optional, Union
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
|
3
3
|
|
|
4
|
-
from lightning_sdk.utils.resolve import _resolve_teamspace
|
|
4
|
+
from lightning_sdk.utils.resolve import _resolve_deprecated_cluster, _resolve_teamspace
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
from lightning_sdk.machine import Machine
|
|
@@ -13,6 +13,8 @@ if TYPE_CHECKING:
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class _BaseJob(ABC):
|
|
16
|
+
"""Base interface to all job types."""
|
|
17
|
+
|
|
16
18
|
def __init__(
|
|
17
19
|
self,
|
|
18
20
|
name: str,
|
|
@@ -22,6 +24,15 @@ class _BaseJob(ABC):
|
|
|
22
24
|
*,
|
|
23
25
|
_fetch_job: bool = True,
|
|
24
26
|
) -> None:
|
|
27
|
+
"""Fetch already existing jobs.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
name: the name of the job
|
|
31
|
+
teamspace: the teamspace the job is part of
|
|
32
|
+
org: the name of the organization owning the :param`teamspace` in case it is owned by an org
|
|
33
|
+
user: the name of the user owning the :param`teamspace`
|
|
34
|
+
in case it is owned directly by a user instead of an org.
|
|
35
|
+
"""
|
|
25
36
|
_teamspace = _resolve_teamspace(teamspace=teamspace, org=org, user=user)
|
|
26
37
|
if _teamspace is None:
|
|
27
38
|
raise ValueError(
|
|
@@ -47,22 +58,59 @@ class _BaseJob(ABC):
|
|
|
47
58
|
teamspace: Union[str, "Teamspace", None] = None,
|
|
48
59
|
org: Union[str, "Organization", None] = None,
|
|
49
60
|
user: Union[str, "User", None] = None,
|
|
50
|
-
|
|
61
|
+
cloud_account: Optional[str] = None,
|
|
51
62
|
env: Optional[Dict[str, str]] = None,
|
|
52
63
|
interruptible: bool = False,
|
|
53
64
|
image_credentials: Optional[str] = None,
|
|
54
|
-
|
|
65
|
+
cloud_account_auth: bool = False,
|
|
55
66
|
artifacts_local: Optional[str] = None,
|
|
56
67
|
artifacts_remote: Optional[str] = None,
|
|
68
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
57
69
|
) -> "_BaseJob":
|
|
70
|
+
"""Run async workloads using a docker image or a compute environment from your studio.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
74
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
75
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
76
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
77
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
78
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
79
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
80
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
81
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
82
|
+
cloud_account: The cloud account to run the job on.
|
|
83
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
84
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
85
|
+
env: Environment variables to set inside the job.
|
|
86
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
87
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
88
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
89
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
90
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
91
|
+
artifacts_local: The path inside the docker container you want to persist images from.
|
|
92
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
93
|
+
Only supported for jobs with a docker image compute environment.
|
|
94
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
95
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
96
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
97
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
98
|
+
within it.
|
|
99
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
100
|
+
Only supported for jobs with a docker image compute environment.
|
|
101
|
+
"""
|
|
58
102
|
from lightning_sdk.studio import Studio
|
|
59
103
|
|
|
104
|
+
cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
|
|
105
|
+
|
|
60
106
|
if not name:
|
|
61
107
|
raise ValueError("A job needs to have a name!")
|
|
62
108
|
|
|
63
109
|
if image is None:
|
|
64
110
|
if not isinstance(studio, Studio):
|
|
65
|
-
studio = Studio(
|
|
111
|
+
studio = Studio(
|
|
112
|
+
name=studio, teamspace=teamspace, org=org, user=user, cloud_account=cloud_account, create_ok=False
|
|
113
|
+
)
|
|
66
114
|
|
|
67
115
|
# studio is a Studio instance at this point
|
|
68
116
|
if teamspace is None:
|
|
@@ -76,20 +124,20 @@ class _BaseJob(ABC):
|
|
|
76
124
|
"Can only run jobs with Studio envs in the teamspace of that Studio."
|
|
77
125
|
)
|
|
78
126
|
|
|
79
|
-
if
|
|
80
|
-
|
|
127
|
+
if cloud_account is None:
|
|
128
|
+
cloud_account = studio.cloud_account
|
|
81
129
|
|
|
82
|
-
if
|
|
130
|
+
if cloud_account != studio.cloud_account:
|
|
83
131
|
raise ValueError(
|
|
84
|
-
"Studio
|
|
85
|
-
"Can only run jobs with Studio envs in the same
|
|
132
|
+
"Studio cloud account does not match provided cloud account. "
|
|
133
|
+
"Can only run jobs with Studio envs in the same cloud account."
|
|
86
134
|
)
|
|
87
135
|
|
|
88
136
|
if image_credentials is not None:
|
|
89
137
|
raise ValueError("image_credentials is only supported when using a custom image")
|
|
90
138
|
|
|
91
|
-
if
|
|
92
|
-
raise ValueError("
|
|
139
|
+
if cloud_account_auth:
|
|
140
|
+
raise ValueError("cloud_account_auth is only supported when using a custom image")
|
|
93
141
|
|
|
94
142
|
if artifacts_local is not None or artifacts_remote is not None:
|
|
95
143
|
raise ValueError(
|
|
@@ -116,14 +164,14 @@ class _BaseJob(ABC):
|
|
|
116
164
|
inst = cls(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=False)
|
|
117
165
|
return inst._submit(
|
|
118
166
|
machine=machine,
|
|
119
|
-
|
|
167
|
+
cloud_account=cloud_account,
|
|
120
168
|
command=command,
|
|
121
169
|
studio=studio,
|
|
122
170
|
image=image,
|
|
123
171
|
env=env,
|
|
124
172
|
interruptible=interruptible,
|
|
125
173
|
image_credentials=image_credentials,
|
|
126
|
-
|
|
174
|
+
cloud_account_auth=cloud_account_auth,
|
|
127
175
|
artifacts_local=artifacts_local,
|
|
128
176
|
artifacts_remote=artifacts_remote,
|
|
129
177
|
)
|
|
@@ -137,46 +185,79 @@ class _BaseJob(ABC):
|
|
|
137
185
|
image: Optional[str] = None,
|
|
138
186
|
env: Optional[Dict[str, str]] = None,
|
|
139
187
|
interruptible: bool = False,
|
|
140
|
-
|
|
188
|
+
cloud_account: Optional[str] = None,
|
|
141
189
|
image_credentials: Optional[str] = None,
|
|
142
|
-
|
|
190
|
+
cloud_account_auth: bool = False,
|
|
143
191
|
artifacts_local: Optional[str] = None,
|
|
144
192
|
artifacts_remote: Optional[str] = None,
|
|
145
193
|
) -> "_BaseJob":
|
|
146
|
-
"""
|
|
194
|
+
"""Submit a new job to the Lightning AI platform.
|
|
195
|
+
|
|
196
|
+
Args:
|
|
197
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
198
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
199
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
200
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
201
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
202
|
+
env: Environment variables to set inside the job.
|
|
203
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
204
|
+
cloud_account: The cloud account to run the job on.
|
|
205
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
206
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
207
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
208
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
209
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
210
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
211
|
+
artifacts_local: The path inside the docker container you want to persist images from.
|
|
212
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
213
|
+
Only supported for jobs with a docker image compute environment.
|
|
214
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
215
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
216
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
217
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
218
|
+
within it.
|
|
219
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
220
|
+
Only supported for jobs with a docker image compute environment.
|
|
221
|
+
"""
|
|
147
222
|
|
|
148
223
|
@abstractmethod
|
|
149
224
|
def stop(self) -> None:
|
|
150
|
-
|
|
225
|
+
"""Stops the job.
|
|
226
|
+
|
|
227
|
+
This is blocking until the job is stopped.
|
|
228
|
+
"""
|
|
151
229
|
|
|
152
230
|
@abstractmethod
|
|
153
231
|
def delete(self) -> None:
|
|
154
|
-
|
|
232
|
+
"""Deletes the job.
|
|
233
|
+
|
|
234
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
235
|
+
"""
|
|
155
236
|
|
|
156
237
|
@property
|
|
157
238
|
@abstractmethod
|
|
158
239
|
def status(self) -> "Status":
|
|
159
|
-
|
|
240
|
+
"""The current status of the job."""
|
|
160
241
|
|
|
161
242
|
@property
|
|
162
243
|
@abstractmethod
|
|
163
244
|
def machine(self) -> "Machine":
|
|
164
|
-
|
|
245
|
+
"""The machine type the job is running on."""
|
|
165
246
|
|
|
166
247
|
@property
|
|
167
248
|
@abstractmethod
|
|
168
249
|
def artifact_path(self) -> Optional[str]:
|
|
169
|
-
|
|
250
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
170
251
|
|
|
171
252
|
@property
|
|
172
253
|
@abstractmethod
|
|
173
254
|
def snapshot_path(self) -> Optional[str]:
|
|
174
|
-
|
|
255
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
175
256
|
|
|
176
257
|
@property
|
|
177
258
|
@abstractmethod
|
|
178
259
|
def share_path(self) -> Optional[str]:
|
|
179
|
-
|
|
260
|
+
"""Path to the jobs share path."""
|
|
180
261
|
|
|
181
262
|
@abstractmethod
|
|
182
263
|
def _update_internal_job(self) -> None:
|
|
@@ -184,8 +265,21 @@ class _BaseJob(ABC):
|
|
|
184
265
|
|
|
185
266
|
@property
|
|
186
267
|
def name(self) -> str:
|
|
268
|
+
"""The job's name."""
|
|
187
269
|
return self._name
|
|
188
270
|
|
|
189
271
|
@property
|
|
190
272
|
def teamspace(self) -> "Teamspace":
|
|
273
|
+
"""The teamspace the job is part of."""
|
|
191
274
|
return self._teamspace
|
|
275
|
+
|
|
276
|
+
@property
|
|
277
|
+
def _guaranteed_job(self) -> Any:
|
|
278
|
+
"""Guarantees that the job was fetched at some point before returning it.
|
|
279
|
+
|
|
280
|
+
Doesn't guarantee to have the latest version of the job. Use _latest_job for that.
|
|
281
|
+
"""
|
|
282
|
+
if getattr(self, "_job", None) is None:
|
|
283
|
+
self._update_internal_job()
|
|
284
|
+
|
|
285
|
+
return self._job
|
lightning_sdk/job/job.py
CHANGED
|
@@ -55,14 +55,47 @@ class Job(_BaseJob):
|
|
|
55
55
|
teamspace: Union[str, "Teamspace", None] = None,
|
|
56
56
|
org: Union[str, "Organization", None] = None,
|
|
57
57
|
user: Union[str, "User", None] = None,
|
|
58
|
-
|
|
58
|
+
cloud_account: Optional[str] = None,
|
|
59
59
|
env: Optional[Dict[str, str]] = None,
|
|
60
60
|
interruptible: bool = False,
|
|
61
61
|
image_credentials: Optional[str] = None,
|
|
62
|
-
|
|
62
|
+
cloud_account_auth: bool = False,
|
|
63
63
|
artifacts_local: Optional[str] = None,
|
|
64
64
|
artifacts_remote: Optional[str] = None,
|
|
65
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
65
66
|
) -> "Job":
|
|
67
|
+
"""Run async workloads using a docker image or a compute environment from your studio.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
71
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
72
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
73
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
74
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
75
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
76
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
77
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
78
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
79
|
+
cloud_account: The cloud account to run the job on.
|
|
80
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
81
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
82
|
+
env: Environment variables to set inside the job.
|
|
83
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
84
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
85
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
86
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
87
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
88
|
+
artifacts_local: The path inside the docker container you want to persist images from.
|
|
89
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
90
|
+
Only supported for jobs with a docker image compute environment.
|
|
91
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
92
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
93
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
94
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
95
|
+
within it.
|
|
96
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
97
|
+
Only supported for jobs with a docker image compute environment.
|
|
98
|
+
"""
|
|
66
99
|
ret_val = super().run(
|
|
67
100
|
name=name,
|
|
68
101
|
machine=machine,
|
|
@@ -72,13 +105,14 @@ class Job(_BaseJob):
|
|
|
72
105
|
teamspace=teamspace,
|
|
73
106
|
org=org,
|
|
74
107
|
user=user,
|
|
75
|
-
|
|
108
|
+
cloud_account=cloud_account,
|
|
76
109
|
env=env,
|
|
77
110
|
interruptible=interruptible,
|
|
78
111
|
image_credentials=image_credentials,
|
|
79
|
-
|
|
112
|
+
cloud_account_auth=cloud_account_auth,
|
|
80
113
|
artifacts_local=artifacts_local,
|
|
81
114
|
artifacts_remote=artifacts_remote,
|
|
115
|
+
cluster=cluster,
|
|
82
116
|
)
|
|
83
117
|
# required for typing with "Job"
|
|
84
118
|
assert isinstance(ret_val, cls)
|
|
@@ -92,51 +126,92 @@ class Job(_BaseJob):
|
|
|
92
126
|
image: Optional[str] = None,
|
|
93
127
|
env: Optional[Dict[str, str]] = None,
|
|
94
128
|
interruptible: bool = False,
|
|
95
|
-
|
|
129
|
+
cloud_account: Optional[str] = None,
|
|
96
130
|
image_credentials: Optional[str] = None,
|
|
97
|
-
|
|
131
|
+
cloud_account_auth: bool = False,
|
|
98
132
|
artifacts_local: Optional[str] = None,
|
|
99
133
|
artifacts_remote: Optional[str] = None,
|
|
100
|
-
) ->
|
|
134
|
+
) -> "Job":
|
|
135
|
+
"""Submit a new job to the Lightning AI platform.
|
|
136
|
+
|
|
137
|
+
Args:
|
|
138
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
139
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
140
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
141
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
142
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
143
|
+
env: Environment variables to set inside the job.
|
|
144
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
145
|
+
cloud_account: The cloud account to run the job on.
|
|
146
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
147
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
148
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
149
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
150
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
151
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
152
|
+
artifacts_local: The path inside the docker container you want to persist images from.
|
|
153
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
154
|
+
Only supported for jobs with a docker image compute environment.
|
|
155
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
156
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
157
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
158
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
159
|
+
within it.
|
|
160
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
161
|
+
Only supported for jobs with a docker image compute environment.
|
|
162
|
+
"""
|
|
101
163
|
self._job = self._internal_job._submit(
|
|
102
164
|
machine=machine,
|
|
103
|
-
|
|
165
|
+
cloud_account=cloud_account,
|
|
104
166
|
command=command,
|
|
105
167
|
studio=studio,
|
|
106
168
|
image=image,
|
|
107
169
|
env=env,
|
|
108
170
|
interruptible=interruptible,
|
|
109
171
|
image_credentials=image_credentials,
|
|
110
|
-
|
|
172
|
+
cloud_account_auth=cloud_account_auth,
|
|
111
173
|
artifacts_local=artifacts_local,
|
|
112
174
|
artifacts_remote=artifacts_remote,
|
|
113
175
|
)
|
|
114
176
|
return self
|
|
115
177
|
|
|
116
178
|
def stop(self) -> None:
|
|
179
|
+
"""Stops the job.
|
|
180
|
+
|
|
181
|
+
This is blocking until the job is stopped.
|
|
182
|
+
"""
|
|
117
183
|
return self._internal_job.stop()
|
|
118
184
|
|
|
119
185
|
def delete(self) -> None:
|
|
186
|
+
"""Deletes the job.
|
|
187
|
+
|
|
188
|
+
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
189
|
+
"""
|
|
120
190
|
return self._internal_job.delete()
|
|
121
191
|
|
|
122
192
|
@property
|
|
123
193
|
def status(self) -> "Status":
|
|
194
|
+
"""The current status of the job."""
|
|
124
195
|
return self._internal_job.status
|
|
125
196
|
|
|
126
197
|
@property
|
|
127
198
|
def machine(self) -> "Machine":
|
|
199
|
+
"""The machine type the job is running on."""
|
|
128
200
|
return self._internal_job.machine
|
|
129
201
|
|
|
130
202
|
@property
|
|
131
203
|
def artifact_path(self) -> Optional[str]:
|
|
204
|
+
"""Path to the artifacts created by the job within the distributed teamspace filesystem."""
|
|
132
205
|
return self._internal_job.artifact_path
|
|
133
206
|
|
|
134
207
|
@property
|
|
135
208
|
def snapshot_path(self) -> Optional[str]:
|
|
209
|
+
"""Path to the studio snapshot used to create the job within the distributed teamspace filesystem."""
|
|
136
210
|
return self._internal_job.snapshot_path
|
|
137
211
|
|
|
138
212
|
@property
|
|
139
213
|
def share_path(self) -> Optional[str]:
|
|
214
|
+
"""Path to the jobs share path."""
|
|
140
215
|
return self._internal_job.share_path
|
|
141
216
|
|
|
142
217
|
def _update_internal_job(self) -> None:
|
|
@@ -144,14 +219,17 @@ class Job(_BaseJob):
|
|
|
144
219
|
|
|
145
220
|
@property
|
|
146
221
|
def name(self) -> str:
|
|
222
|
+
"""The job's name."""
|
|
147
223
|
return self._internal_job.name
|
|
148
224
|
|
|
149
225
|
@property
|
|
150
226
|
def teamspace(self) -> "Teamspace":
|
|
227
|
+
"""The teamspace the job is part of."""
|
|
151
228
|
return self._internal_job._teamspace
|
|
152
229
|
|
|
153
230
|
@property
|
|
154
231
|
def cluster(self) -> Optional[str]:
|
|
232
|
+
"""The cluster the job is running on."""
|
|
155
233
|
return self._internal_job.cluster
|
|
156
234
|
|
|
157
235
|
def __getattr__(self, key: str) -> Any:
|
lightning_sdk/job/v1.py
CHANGED
|
@@ -17,6 +17,8 @@ from lightning_sdk.job.work import Work
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class _JobV1(_BaseJob):
|
|
20
|
+
"""Implementation to run async workloads from your Studio."""
|
|
21
|
+
|
|
20
22
|
def __init__(
|
|
21
23
|
self,
|
|
22
24
|
name: str,
|
|
@@ -26,6 +28,15 @@ class _JobV1(_BaseJob):
|
|
|
26
28
|
*,
|
|
27
29
|
_fetch_job: bool = True,
|
|
28
30
|
) -> None:
|
|
31
|
+
"""Fetch already existing jobs.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
name: the name of the job
|
|
35
|
+
teamspace: the teamspace the job is part of
|
|
36
|
+
org: the name of the organization owning the :param`teamspace` in case it is owned by an org
|
|
37
|
+
user: the name of the user owning the :param`teamspace`
|
|
38
|
+
in case it is owned directly by a user instead of an org
|
|
39
|
+
"""
|
|
29
40
|
self._job_api = JobApiV1()
|
|
30
41
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
31
42
|
|
|
@@ -39,9 +50,26 @@ class _JobV1(_BaseJob):
|
|
|
39
50
|
teamspace: Union[str, "Teamspace", None] = None,
|
|
40
51
|
org: Union[str, "Organization", None] = None,
|
|
41
52
|
user: Union[str, "User", None] = None,
|
|
42
|
-
|
|
53
|
+
cloud_account: Optional[str] = None,
|
|
43
54
|
interruptible: bool = False,
|
|
55
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
44
56
|
) -> "_BaseJob":
|
|
57
|
+
"""Start a new async workload from your studio.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
name: the name of the job
|
|
61
|
+
machine: the machine to run the workload on
|
|
62
|
+
command: the command to execute
|
|
63
|
+
studio: the studio the job belongs to
|
|
64
|
+
teamspace: the teamspace the job is part of
|
|
65
|
+
org: the organization owning the teamspace (if applicable)
|
|
66
|
+
user: the user owning the teamspace (if applicable)
|
|
67
|
+
cloud_account: the cloud account to run the workload on
|
|
68
|
+
interruptible: whether the workload can be interrupted
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
the created job
|
|
72
|
+
"""
|
|
45
73
|
return super().run(
|
|
46
74
|
name=name,
|
|
47
75
|
machine=machine,
|
|
@@ -51,11 +79,12 @@ class _JobV1(_BaseJob):
|
|
|
51
79
|
teamspace=teamspace,
|
|
52
80
|
org=org,
|
|
53
81
|
user=user,
|
|
54
|
-
|
|
82
|
+
cloud_account=cloud_account,
|
|
55
83
|
env=None,
|
|
56
84
|
interruptible=interruptible,
|
|
57
85
|
image_credentials=None,
|
|
58
|
-
|
|
86
|
+
cloud_account_auth=False,
|
|
87
|
+
cluster=cluster,
|
|
59
88
|
)
|
|
60
89
|
|
|
61
90
|
def _submit(
|
|
@@ -66,16 +95,34 @@ class _JobV1(_BaseJob):
|
|
|
66
95
|
image: Optional[str] = None,
|
|
67
96
|
env: Optional[Dict[str, str]] = None,
|
|
68
97
|
interruptible: bool = False,
|
|
69
|
-
|
|
98
|
+
cloud_account: Optional[str] = None,
|
|
70
99
|
image_credentials: Optional[str] = None,
|
|
71
|
-
|
|
100
|
+
cloud_account_auth: bool = False,
|
|
72
101
|
artifacts_local: Optional[str] = None,
|
|
73
102
|
artifacts_remote: Optional[str] = None,
|
|
74
103
|
) -> "_JobV1":
|
|
104
|
+
"""Submit a job to run on a machine.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
machine: The machine to run the job on.
|
|
108
|
+
command: The command to execute.
|
|
109
|
+
studio: The studio the job belongs to.
|
|
110
|
+
image: The image to use for the job (not supported).
|
|
111
|
+
env: The environment variables for the job (not supported).
|
|
112
|
+
interruptible: Whether the job can be interrupted.
|
|
113
|
+
cloud_account: The cloud account to run the job on.
|
|
114
|
+
image_credentials: The image credentials for the job (not supported).
|
|
115
|
+
cloud_account_auth: Whether to use cloud account authentication for the job (not supported).
|
|
116
|
+
artifacts_local: The local path for persisting artifacts (not supported).
|
|
117
|
+
artifacts_remote: The remote path for persisting artifacts (not supported).
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
The submitted job.
|
|
121
|
+
|
|
122
|
+
"""
|
|
75
123
|
if studio is None:
|
|
76
124
|
raise ValueError("Studio is required for submitting jobs")
|
|
77
|
-
|
|
78
|
-
if image is not None or image_credentials is not None or cluster_auth:
|
|
125
|
+
if image is not None or image_credentials is not None or cloud_account_auth:
|
|
79
126
|
raise ValueError("Image is not supported for submitting jobs")
|
|
80
127
|
|
|
81
128
|
if artifacts_local is not None or artifacts_remote is not None:
|
|
@@ -83,18 +130,15 @@ class _JobV1(_BaseJob):
|
|
|
83
130
|
|
|
84
131
|
if env is not None:
|
|
85
132
|
raise ValueError("Environment variables are not supported for submitting jobs")
|
|
86
|
-
|
|
87
133
|
if command is None:
|
|
88
134
|
raise ValueError("Command is required for submitting jobs")
|
|
89
|
-
|
|
90
135
|
# TODO: add support for empty names (will give an empty string)
|
|
91
|
-
|
|
92
136
|
_submitted = self._job_api.submit_job(
|
|
93
137
|
name=self._name,
|
|
94
138
|
command=command,
|
|
95
139
|
studio_id=studio._studio.id,
|
|
96
140
|
teamspace_id=self._teamspace.id,
|
|
97
|
-
|
|
141
|
+
cloud_account=cloud_account or "",
|
|
98
142
|
machine=machine,
|
|
99
143
|
interruptible=interruptible,
|
|
100
144
|
)
|
|
@@ -110,6 +154,7 @@ class _JobV1(_BaseJob):
|
|
|
110
154
|
|
|
111
155
|
@property
|
|
112
156
|
def status(self) -> "Status":
|
|
157
|
+
"""Returns the status of the job."""
|
|
113
158
|
try:
|
|
114
159
|
status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
|
|
115
160
|
return _internal_status_to_external_status(status)
|
|
@@ -119,19 +164,22 @@ class _JobV1(_BaseJob):
|
|
|
119
164
|
) from None
|
|
120
165
|
|
|
121
166
|
def stop(self) -> None:
|
|
167
|
+
"""Stops the job. This is blocking until the job is stopped."""
|
|
122
168
|
if self.status in (Status.Stopped, Status.Failed):
|
|
123
169
|
return None
|
|
124
170
|
|
|
125
171
|
return self._job_api.stop_job(self._job.id, self.teamspace.id)
|
|
126
172
|
|
|
127
173
|
def delete(self) -> None:
|
|
128
|
-
|
|
174
|
+
"""Deletes the job.
|
|
129
175
|
|
|
130
|
-
|
|
131
|
-
|
|
176
|
+
Caution: this also deletes all artifacts created by the job.
|
|
177
|
+
"""
|
|
178
|
+
self._job_api.delete_job(self._job.id, self.teamspace.id)
|
|
132
179
|
|
|
133
180
|
@cached_property
|
|
134
181
|
def work(self) -> Work:
|
|
182
|
+
"""Get the work associated with the job."""
|
|
135
183
|
_work = self._job_api.list_works(self._job.id, self.teamspace.id)
|
|
136
184
|
if len(_work) == 0:
|
|
137
185
|
raise ValueError("No works found for job")
|
|
@@ -139,28 +187,37 @@ class _JobV1(_BaseJob):
|
|
|
139
187
|
|
|
140
188
|
@property
|
|
141
189
|
def machine(self) -> "Machine":
|
|
190
|
+
"""Get the machine the job is running on."""
|
|
142
191
|
return self.work.machine
|
|
143
192
|
|
|
144
|
-
@property
|
|
145
|
-
def id(self) -> str:
|
|
146
|
-
return self._job.id
|
|
147
|
-
|
|
148
193
|
@property
|
|
149
194
|
def name(self) -> str:
|
|
195
|
+
"""The name of the job."""
|
|
150
196
|
return self._job.name
|
|
151
197
|
|
|
152
198
|
@property
|
|
153
199
|
def artifact_path(self) -> Optional[str]:
|
|
200
|
+
"""The path to the artifacts of the job in the distributed teamspace filesystem."""
|
|
154
201
|
return self.work.artifact_path
|
|
155
202
|
|
|
156
203
|
@property
|
|
157
204
|
def snapshot_path(self) -> Optional[str]:
|
|
205
|
+
"""The path to the snapshot of the job in the distributed teamspace filesystem."""
|
|
158
206
|
return f"/teamspace/jobs/{self.name}/snapshot"
|
|
159
207
|
|
|
160
208
|
@property
|
|
161
209
|
def share_path(self) -> Optional[str]:
|
|
210
|
+
"""The path to the share of the job in the distributed teamspace filesystem."""
|
|
162
211
|
return f"/teamspace/jobs/{self.name}/share"
|
|
163
212
|
|
|
213
|
+
# the following property and function exist solely to make the Work class function
|
|
214
|
+
@property
|
|
215
|
+
def _id(self) -> str:
|
|
216
|
+
return self._guaranteed_job.id
|
|
217
|
+
|
|
218
|
+
def _name_filter(self, name: str) -> str:
|
|
219
|
+
return name.replace("root.", "")
|
|
220
|
+
|
|
164
221
|
|
|
165
222
|
def _internal_status_to_external_status(internal_status: str) -> "Status":
|
|
166
223
|
"""Converts internal status strings from HTTP requests to external enums."""
|