lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/ai_hub.py +8 -3
- lightning_sdk/api/ai_hub_api.py +3 -3
- lightning_sdk/api/deployment_api.py +6 -6
- lightning_sdk/api/job_api.py +32 -6
- lightning_sdk/api/mmt_api.py +59 -19
- lightning_sdk/api/studio_api.py +37 -19
- lightning_sdk/api/teamspace_api.py +34 -29
- lightning_sdk/api/utils.py +46 -34
- lightning_sdk/cli/ai_hub.py +3 -3
- lightning_sdk/cli/entrypoint.py +3 -1
- lightning_sdk/cli/run.py +122 -12
- lightning_sdk/cli/serve.py +218 -0
- lightning_sdk/deployment/deployment.py +18 -12
- lightning_sdk/job/base.py +118 -24
- lightning_sdk/job/job.py +98 -9
- lightning_sdk/job/v1.py +75 -18
- lightning_sdk/job/v2.py +51 -15
- lightning_sdk/job/work.py +36 -7
- lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
- lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
- lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
- lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
- lightning_sdk/lightning_cloud/rest_client.py +2 -0
- lightning_sdk/mmt/__init__.py +4 -0
- lightning_sdk/mmt/base.py +278 -0
- lightning_sdk/mmt/mmt.py +267 -0
- lightning_sdk/mmt/v1.py +181 -0
- lightning_sdk/mmt/v2.py +188 -0
- lightning_sdk/plugin.py +43 -16
- lightning_sdk/services/file_endpoint.py +11 -5
- lightning_sdk/studio.py +16 -9
- lightning_sdk/teamspace.py +21 -8
- lightning_sdk/utils/resolve.py +18 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
- lightning_sdk/_mmt/__init__.py +0 -3
- lightning_sdk/_mmt/base.py +0 -180
- lightning_sdk/_mmt/mmt.py +0 -161
- lightning_sdk/_mmt/v1.py +0 -69
- lightning_sdk/_mmt/v2.py +0 -141
- lightning_sdk/cli/mmt.py +0 -137
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
lightning_sdk/job/v1.py
CHANGED
|
@@ -17,6 +17,8 @@ from lightning_sdk.job.work import Work
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class _JobV1(_BaseJob):
|
|
20
|
+
"""Implementation to run async workloads from your Studio."""
|
|
21
|
+
|
|
20
22
|
def __init__(
|
|
21
23
|
self,
|
|
22
24
|
name: str,
|
|
@@ -26,6 +28,15 @@ class _JobV1(_BaseJob):
|
|
|
26
28
|
*,
|
|
27
29
|
_fetch_job: bool = True,
|
|
28
30
|
) -> None:
|
|
31
|
+
"""Fetch already existing jobs.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
name: the name of the job
|
|
35
|
+
teamspace: the teamspace the job is part of
|
|
36
|
+
org: the name of the organization owning the ``teamspace`` in case it is owned by an org
|
|
37
|
+
user: the name of the user owning the ``teamspace``
|
|
38
|
+
in case it is owned directly by a user instead of an org
|
|
39
|
+
"""
|
|
29
40
|
self._job_api = JobApiV1()
|
|
30
41
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
31
42
|
|
|
@@ -39,9 +50,26 @@ class _JobV1(_BaseJob):
|
|
|
39
50
|
teamspace: Union[str, "Teamspace", None] = None,
|
|
40
51
|
org: Union[str, "Organization", None] = None,
|
|
41
52
|
user: Union[str, "User", None] = None,
|
|
42
|
-
|
|
53
|
+
cloud_account: Optional[str] = None,
|
|
43
54
|
interruptible: bool = False,
|
|
55
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
44
56
|
) -> "_BaseJob":
|
|
57
|
+
"""Start a new async workload from your studio.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
name: the name of the job
|
|
61
|
+
machine: the machine to run the workload on
|
|
62
|
+
command: the command to execute
|
|
63
|
+
studio: the studio the job belongs to
|
|
64
|
+
teamspace: the teamspace the job is part of
|
|
65
|
+
org: the organization owning the teamspace (if applicable)
|
|
66
|
+
user: the user owning the teamspace (if applicable)
|
|
67
|
+
cloud_account: the cloud account to run the workload on
|
|
68
|
+
interruptible: whether the workload can be interrupted
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
the created job
|
|
72
|
+
"""
|
|
45
73
|
return super().run(
|
|
46
74
|
name=name,
|
|
47
75
|
machine=machine,
|
|
@@ -51,11 +79,12 @@ class _JobV1(_BaseJob):
|
|
|
51
79
|
teamspace=teamspace,
|
|
52
80
|
org=org,
|
|
53
81
|
user=user,
|
|
54
|
-
|
|
82
|
+
cloud_account=cloud_account,
|
|
55
83
|
env=None,
|
|
56
84
|
interruptible=interruptible,
|
|
57
85
|
image_credentials=None,
|
|
58
|
-
|
|
86
|
+
cloud_account_auth=False,
|
|
87
|
+
cluster=cluster,
|
|
59
88
|
)
|
|
60
89
|
|
|
61
90
|
def _submit(
|
|
@@ -66,16 +95,34 @@ class _JobV1(_BaseJob):
|
|
|
66
95
|
image: Optional[str] = None,
|
|
67
96
|
env: Optional[Dict[str, str]] = None,
|
|
68
97
|
interruptible: bool = False,
|
|
69
|
-
|
|
98
|
+
cloud_account: Optional[str] = None,
|
|
70
99
|
image_credentials: Optional[str] = None,
|
|
71
|
-
|
|
100
|
+
cloud_account_auth: bool = False,
|
|
72
101
|
artifacts_local: Optional[str] = None,
|
|
73
102
|
artifacts_remote: Optional[str] = None,
|
|
74
103
|
) -> "_JobV1":
|
|
104
|
+
"""Submit a job to run on a machine.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
machine: The machine to run the job on.
|
|
108
|
+
command: The command to execute.
|
|
109
|
+
studio: The studio the job belongs to.
|
|
110
|
+
image: The image to use for the job (not supported).
|
|
111
|
+
env: The environment variables for the job (not supported).
|
|
112
|
+
interruptible: Whether the job can be interrupted.
|
|
113
|
+
cloud_account: The cloud account to run the job on.
|
|
114
|
+
image_credentials: The image credentials for the job (not supported).
|
|
115
|
+
cloud_account_auth: Whether to use cloud account authentication for the job (not supported).
|
|
116
|
+
artifacts_local: The local path for persisting artifacts (not supported).
|
|
117
|
+
artifacts_remote: The remote path for persisting artifacts (not supported).
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
The submitted job.
|
|
121
|
+
|
|
122
|
+
"""
|
|
75
123
|
if studio is None:
|
|
76
124
|
raise ValueError("Studio is required for submitting jobs")
|
|
77
|
-
|
|
78
|
-
if image is not None or image_credentials is not None or cluster_auth:
|
|
125
|
+
if image is not None or image_credentials is not None or cloud_account_auth:
|
|
79
126
|
raise ValueError("Image is not supported for submitting jobs")
|
|
80
127
|
|
|
81
128
|
if artifacts_local is not None or artifacts_remote is not None:
|
|
@@ -83,18 +130,15 @@ class _JobV1(_BaseJob):
|
|
|
83
130
|
|
|
84
131
|
if env is not None:
|
|
85
132
|
raise ValueError("Environment variables are not supported for submitting jobs")
|
|
86
|
-
|
|
87
133
|
if command is None:
|
|
88
134
|
raise ValueError("Command is required for submitting jobs")
|
|
89
|
-
|
|
90
135
|
# TODO: add support for empty names (will give an empty string)
|
|
91
|
-
|
|
92
136
|
_submitted = self._job_api.submit_job(
|
|
93
137
|
name=self._name,
|
|
94
138
|
command=command,
|
|
95
139
|
studio_id=studio._studio.id,
|
|
96
140
|
teamspace_id=self._teamspace.id,
|
|
97
|
-
|
|
141
|
+
cloud_account=cloud_account or "",
|
|
98
142
|
machine=machine,
|
|
99
143
|
interruptible=interruptible,
|
|
100
144
|
)
|
|
@@ -110,6 +154,7 @@ class _JobV1(_BaseJob):
|
|
|
110
154
|
|
|
111
155
|
@property
|
|
112
156
|
def status(self) -> "Status":
|
|
157
|
+
"""Returns the status of the job."""
|
|
113
158
|
try:
|
|
114
159
|
status = self._job_api.get_job_status(self._job.id, self.teamspace.id)
|
|
115
160
|
return _internal_status_to_external_status(status)
|
|
@@ -119,19 +164,22 @@ class _JobV1(_BaseJob):
|
|
|
119
164
|
) from None
|
|
120
165
|
|
|
121
166
|
def stop(self) -> None:
|
|
167
|
+
"""Stops the job. This is blocking until the job is stopped."""
|
|
122
168
|
if self.status in (Status.Stopped, Status.Failed):
|
|
123
169
|
return None
|
|
124
170
|
|
|
125
171
|
return self._job_api.stop_job(self._job.id, self.teamspace.id)
|
|
126
172
|
|
|
127
173
|
def delete(self) -> None:
|
|
128
|
-
|
|
174
|
+
"""Deletes the job.
|
|
129
175
|
|
|
130
|
-
|
|
131
|
-
|
|
176
|
+
Caution: this also deletes all artifacts created by the job.
|
|
177
|
+
"""
|
|
178
|
+
self._job_api.delete_job(self._job.id, self.teamspace.id)
|
|
132
179
|
|
|
133
180
|
@cached_property
|
|
134
181
|
def work(self) -> Work:
|
|
182
|
+
"""Get the work associated with the job."""
|
|
135
183
|
_work = self._job_api.list_works(self._job.id, self.teamspace.id)
|
|
136
184
|
if len(_work) == 0:
|
|
137
185
|
raise ValueError("No works found for job")
|
|
@@ -139,28 +187,37 @@ class _JobV1(_BaseJob):
|
|
|
139
187
|
|
|
140
188
|
@property
|
|
141
189
|
def machine(self) -> "Machine":
|
|
190
|
+
"""Get the machine the job is running on."""
|
|
142
191
|
return self.work.machine
|
|
143
192
|
|
|
144
|
-
@property
|
|
145
|
-
def id(self) -> str:
|
|
146
|
-
return self._job.id
|
|
147
|
-
|
|
148
193
|
@property
|
|
149
194
|
def name(self) -> str:
|
|
195
|
+
"""The name of the job."""
|
|
150
196
|
return self._job.name
|
|
151
197
|
|
|
152
198
|
@property
|
|
153
199
|
def artifact_path(self) -> Optional[str]:
|
|
200
|
+
"""The path to the artifacts of the job in the distributed teamspace filesystem."""
|
|
154
201
|
return self.work.artifact_path
|
|
155
202
|
|
|
156
203
|
@property
|
|
157
204
|
def snapshot_path(self) -> Optional[str]:
|
|
205
|
+
"""The path to the snapshot of the job in the distributed teamspace filesystem."""
|
|
158
206
|
return f"/teamspace/jobs/{self.name}/snapshot"
|
|
159
207
|
|
|
160
208
|
@property
|
|
161
209
|
def share_path(self) -> Optional[str]:
|
|
210
|
+
"""The path to the share of the job in the distributed teamspace filesystem."""
|
|
162
211
|
return f"/teamspace/jobs/{self.name}/share"
|
|
163
212
|
|
|
213
|
+
# the following property and function exist solely to make the Work class function
|
|
214
|
+
@property
|
|
215
|
+
def _id(self) -> str:
|
|
216
|
+
return self._guaranteed_job.id
|
|
217
|
+
|
|
218
|
+
def _name_filter(self, name: str) -> str:
|
|
219
|
+
return name.replace("root.", "")
|
|
220
|
+
|
|
164
221
|
|
|
165
222
|
def _internal_status_to_external_status(internal_status: str) -> "Status":
|
|
166
223
|
"""Converts internal status strings from HTTP requests to external enums."""
|
lightning_sdk/job/v2.py
CHANGED
|
@@ -22,6 +22,15 @@ class _JobV2(_BaseJob):
|
|
|
22
22
|
*,
|
|
23
23
|
_fetch_job: bool = True,
|
|
24
24
|
) -> None:
|
|
25
|
+
"""Fetch already existing jobs.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
name: the name of the job
|
|
29
|
+
teamspace: the teamspace the job is part of
|
|
30
|
+
org: the name of the organization owning the ``teamspace`` in case it is owned by an org
|
|
31
|
+
user: the name of the user owning the ``teamspace``
|
|
32
|
+
in case it is owned directly by a user instead of an org.
|
|
33
|
+
"""
|
|
25
34
|
self._job_api = JobApiV2()
|
|
26
35
|
super().__init__(name=name, teamspace=teamspace, org=org, user=user, _fetch_job=_fetch_job)
|
|
27
36
|
|
|
@@ -33,12 +42,40 @@ class _JobV2(_BaseJob):
|
|
|
33
42
|
image: Optional[str] = None,
|
|
34
43
|
env: Optional[Dict[str, str]] = None,
|
|
35
44
|
interruptible: bool = False,
|
|
36
|
-
|
|
45
|
+
cloud_account: Optional[str] = None,
|
|
37
46
|
image_credentials: Optional[str] = None,
|
|
38
|
-
|
|
47
|
+
cloud_account_auth: bool = False,
|
|
39
48
|
artifacts_local: Optional[str] = None,
|
|
40
49
|
artifacts_remote: Optional[str] = None,
|
|
41
50
|
) -> "_JobV2":
|
|
51
|
+
"""Submit a new job to the Lightning AI platform.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}.
|
|
55
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
56
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
57
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
58
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
59
|
+
env: Environment variables to set inside the job.
|
|
60
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
61
|
+
cloud_account: The cloud account to run the job on.
|
|
62
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
63
|
+
If not provided will fall back to the teamspace's default cloud account.
|
|
64
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
65
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
66
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
67
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
68
|
+
artifacts_local: The path inside the docker container that you want to persist images from.
|
|
69
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
70
|
+
Only supported for jobs with a docker image compute environment.
|
|
71
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
72
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
73
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
74
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
75
|
+
within it.
|
|
76
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
77
|
+
Only supported for jobs with a docker image compute environment.
|
|
78
|
+
"""
|
|
42
79
|
# Command is required if Studio is provided to know what to run
|
|
43
80
|
# Image is mutually exclusive with Studio
|
|
44
81
|
# Command is optional for Image
|
|
@@ -59,7 +96,7 @@ class _JobV2(_BaseJob):
|
|
|
59
96
|
submitted = self._job_api.submit_job(
|
|
60
97
|
name=self.name,
|
|
61
98
|
command=command,
|
|
62
|
-
|
|
99
|
+
cloud_account=cloud_account,
|
|
63
100
|
teamspace_id=self._teamspace.id,
|
|
64
101
|
studio_id=studio_id,
|
|
65
102
|
image=image,
|
|
@@ -67,7 +104,7 @@ class _JobV2(_BaseJob):
|
|
|
67
104
|
interruptible=interruptible,
|
|
68
105
|
env=env,
|
|
69
106
|
image_credentials=image_credentials,
|
|
70
|
-
|
|
107
|
+
cloud_account_auth=cloud_account_auth,
|
|
71
108
|
artifacts_local=artifacts_local,
|
|
72
109
|
artifacts_remote=artifacts_remote,
|
|
73
110
|
)
|
|
@@ -76,9 +113,14 @@ class _JobV2(_BaseJob):
|
|
|
76
113
|
return self
|
|
77
114
|
|
|
78
115
|
def stop(self) -> None:
|
|
116
|
+
"""Stop the job. If the job is already stopped, this is a no-op. This is blocking until the job is stopped."""
|
|
79
117
|
self._job_api.stop_job(job_id=self._guaranteed_job.id, teamspace_id=self._teamspace.id)
|
|
80
118
|
|
|
81
119
|
def delete(self) -> None:
|
|
120
|
+
"""Delete the job.
|
|
121
|
+
|
|
122
|
+
Caution: This also deletes all artifacts created by the job.
|
|
123
|
+
"""
|
|
82
124
|
self._job_api.delete_job(
|
|
83
125
|
job_id=self._guaranteed_job.id,
|
|
84
126
|
teamspace_id=self._teamspace.id,
|
|
@@ -91,28 +133,20 @@ class _JobV2(_BaseJob):
|
|
|
91
133
|
self._update_internal_job()
|
|
92
134
|
return self._job
|
|
93
135
|
|
|
94
|
-
@property
|
|
95
|
-
def _guaranteed_job(self) -> Any:
|
|
96
|
-
"""Guarantees that the job was fetched at some point before returning it.
|
|
97
|
-
|
|
98
|
-
Doesn't guarantee to have the lastest version of the job. Use _latest_job for that.
|
|
99
|
-
"""
|
|
100
|
-
if getattr(self, "_job", None) is None:
|
|
101
|
-
self._update_internal_job()
|
|
102
|
-
|
|
103
|
-
return self._job
|
|
104
|
-
|
|
105
136
|
@property
|
|
106
137
|
def status(self) -> "Status":
|
|
138
|
+
"""The current status of the job."""
|
|
107
139
|
return self._job_api._job_state_to_external(self._latest_job.state)
|
|
108
140
|
|
|
109
141
|
@property
|
|
110
142
|
def machine(self) -> "Machine":
|
|
143
|
+
"""The machine type the job is running on."""
|
|
111
144
|
# only fetch the job if it hasn't been fetched yet as machine cannot change over time
|
|
112
145
|
return self._job_api._get_job_machine_from_spec(self._guaranteed_job.spec)
|
|
113
146
|
|
|
114
147
|
@property
|
|
115
148
|
def artifact_path(self) -> Optional[str]:
|
|
149
|
+
"""The path to the artifacts of the job within the distributed teamspace filesystem."""
|
|
116
150
|
if self._guaranteed_job.spec.image != "":
|
|
117
151
|
if self._guaranteed_job.spec.artifacts_destination != "":
|
|
118
152
|
splits = self._guaranteed_job.spec.artifacts_destination.split(":")
|
|
@@ -123,12 +157,14 @@ class _JobV2(_BaseJob):
|
|
|
123
157
|
|
|
124
158
|
@property
|
|
125
159
|
def snapshot_path(self) -> Optional[str]:
|
|
160
|
+
"""The path to the snapshot of the Studio used to create the job within the distributed teamspace filesystem."""
|
|
126
161
|
if self._guaranteed_job.spec.image != "":
|
|
127
162
|
return None
|
|
128
163
|
return f"/teamspace/jobs/{self._guaranteed_job.name}/snapshot"
|
|
129
164
|
|
|
130
165
|
@property
|
|
131
166
|
def share_path(self) -> Optional[str]:
|
|
167
|
+
"""The path to the share of the job within the distributed teamspace filesystem."""
|
|
132
168
|
raise NotImplementedError("Not implemented yet")
|
|
133
169
|
|
|
134
170
|
def _update_internal_job(self) -> None:
|
lightning_sdk/job/work.py
CHANGED
|
@@ -1,33 +1,62 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Optional
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Optional, Protocol
|
|
2
2
|
|
|
3
3
|
from lightning_sdk.api.job_api import JobApiV1
|
|
4
4
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
|
-
from lightning_sdk.
|
|
6
|
+
from lightning_sdk.status import Status
|
|
7
7
|
from lightning_sdk.teamspace import Teamspace
|
|
8
8
|
from lightning_sdk.machine import Machine
|
|
9
9
|
|
|
10
10
|
|
|
11
|
+
class _WorkHolder(Protocol):
|
|
12
|
+
@property
|
|
13
|
+
def _id(self) -> str:
|
|
14
|
+
...
|
|
15
|
+
|
|
16
|
+
@property
|
|
17
|
+
def name(self) -> str:
|
|
18
|
+
...
|
|
19
|
+
|
|
20
|
+
def _name_filter(self, name: str) -> str:
|
|
21
|
+
...
|
|
22
|
+
|
|
23
|
+
|
|
11
24
|
class Work:
|
|
12
|
-
def __init__(self, work_id: str, job:
|
|
25
|
+
def __init__(self, work_id: str, job: _WorkHolder, teamspace: "Teamspace") -> None:
|
|
13
26
|
self._id = work_id
|
|
14
27
|
self._job = job
|
|
15
28
|
self._teamspace = teamspace
|
|
16
29
|
self._job_api = JobApiV1()
|
|
17
|
-
self._work =
|
|
30
|
+
self._work = None
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
def _latest_work(self) -> Any:
|
|
34
|
+
self._work = self._job_api.get_work(work_id=self._id, job_id=self._job._id, teamspace_id=self._teamspace.id)
|
|
35
|
+
return self._work
|
|
36
|
+
|
|
37
|
+
@property
|
|
38
|
+
def _guaranteed_work(self) -> Any:
|
|
39
|
+
if self._work is None:
|
|
40
|
+
return self._latest_work
|
|
41
|
+
|
|
42
|
+
return self._work
|
|
18
43
|
|
|
19
44
|
@property
|
|
20
45
|
def id(self) -> str:
|
|
21
|
-
return self.
|
|
46
|
+
return self._guaranteed_work.id
|
|
22
47
|
|
|
23
48
|
@property
|
|
24
49
|
def name(self) -> str:
|
|
25
|
-
return self._job._name_filter(self.
|
|
50
|
+
return self._job._name_filter(self._guaranteed_work.name)
|
|
26
51
|
|
|
27
52
|
@property
|
|
28
53
|
def machine(self) -> "Machine":
|
|
29
|
-
return self._job_api.get_machine_from_work(self.
|
|
54
|
+
return self._job_api.get_machine_from_work(self._guaranteed_work)
|
|
30
55
|
|
|
31
56
|
@property
|
|
32
57
|
def artifact_path(self) -> Optional[str]:
|
|
33
58
|
return f"/teamspace/jobs/{self._job.name}/{self.name}"
|
|
59
|
+
|
|
60
|
+
@property
|
|
61
|
+
def status(self) -> "Status":
|
|
62
|
+
return self._job_api.get_status_from_work(self._latest_work)
|
|
@@ -125,6 +125,7 @@ from lightning_sdk.lightning_cloud.openapi.models.id_start_body import IdStartBo
|
|
|
125
125
|
from lightning_sdk.lightning_cloud.openapi.models.id_storage_body import IdStorageBody
|
|
126
126
|
from lightning_sdk.lightning_cloud.openapi.models.id_uploads_body import IdUploadsBody
|
|
127
127
|
from lightning_sdk.lightning_cloud.openapi.models.id_uploads_body1 import IdUploadsBody1
|
|
128
|
+
from lightning_sdk.lightning_cloud.openapi.models.id_visibility_body import IdVisibilityBody
|
|
128
129
|
from lightning_sdk.lightning_cloud.openapi.models.jobs_id_body import JobsIdBody
|
|
129
130
|
from lightning_sdk.lightning_cloud.openapi.models.jobs_id_body1 import JobsIdBody1
|
|
130
131
|
from lightning_sdk.lightning_cloud.openapi.models.jobs_id_body2 import JobsIdBody2
|
|
@@ -445,6 +446,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_get_long_running_command_in
|
|
|
445
446
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_model_file_upload_urls_response import V1GetModelFileUploadUrlsResponse
|
|
446
447
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_model_file_url_response import V1GetModelFileUrlResponse
|
|
447
448
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_model_files_response import V1GetModelFilesResponse
|
|
449
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_get_model_files_url_response import V1GetModelFilesUrlResponse
|
|
448
450
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_project_artifact_response import V1GetProjectArtifactResponse
|
|
449
451
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_project_balance_response import V1GetProjectBalanceResponse
|
|
450
452
|
from lightning_sdk.lightning_cloud.openapi.models.v1_get_project_compute_usage_response import V1GetProjectComputeUsageResponse
|
|
@@ -558,6 +560,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_list_memberships_response i
|
|
|
558
560
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_metrics_streams_response import V1ListMetricsStreamsResponse
|
|
559
561
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_model_versions_response import V1ListModelVersionsResponse
|
|
560
562
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_models_response import V1ListModelsResponse
|
|
563
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_list_multi_machine_job_events_response import V1ListMultiMachineJobEventsResponse
|
|
561
564
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_multi_machine_jobs_response import V1ListMultiMachineJobsResponse
|
|
562
565
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_new_features_for_user_response import V1ListNewFeaturesForUserResponse
|
|
563
566
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_org_members_response import V1ListOrgMembersResponse
|
|
@@ -612,9 +615,14 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_metrics_stream import V1Met
|
|
|
612
615
|
from lightning_sdk.lightning_cloud.openapi.models.v1_metrics_tags import V1MetricsTags
|
|
613
616
|
from lightning_sdk.lightning_cloud.openapi.models.v1_metrics_tracker import V1MetricsTracker
|
|
614
617
|
from lightning_sdk.lightning_cloud.openapi.models.v1_model import V1Model
|
|
618
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_model_file import V1ModelFile
|
|
615
619
|
from lightning_sdk.lightning_cloud.openapi.models.v1_model_version_archive import V1ModelVersionArchive
|
|
616
620
|
from lightning_sdk.lightning_cloud.openapi.models.v1_mount_target import V1MountTarget
|
|
617
621
|
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job import V1MultiMachineJob
|
|
622
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_event import V1MultiMachineJobEvent
|
|
623
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_event_type import V1MultiMachineJobEventType
|
|
624
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_fault_tolerance import V1MultiMachineJobFaultTolerance
|
|
625
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_fault_tolerance_strategy import V1MultiMachineJobFaultToleranceStrategy
|
|
618
626
|
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_state import V1MultiMachineJobState
|
|
619
627
|
from lightning_sdk.lightning_cloud.openapi.models.v1_multi_machine_job_status import V1MultiMachineJobStatus
|
|
620
628
|
from lightning_sdk.lightning_cloud.openapi.models.v1_named_get_logger_metrics import V1NamedGetLoggerMetrics
|
|
@@ -724,6 +732,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_system_metrics import V1Sys
|
|
|
724
732
|
from lightning_sdk.lightning_cloud.openapi.models.v1_system_metrics_list import V1SystemMetricsList
|
|
725
733
|
from lightning_sdk.lightning_cloud.openapi.models.v1_telemetry import V1Telemetry
|
|
726
734
|
from lightning_sdk.lightning_cloud.openapi.models.v1_timestamp_code_telemetry import V1TimestampCodeTelemetry
|
|
735
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_trainium_system_metrics import V1TrainiumSystemMetrics
|
|
727
736
|
from lightning_sdk.lightning_cloud.openapi.models.v1_transaction import V1Transaction
|
|
728
737
|
from lightning_sdk.lightning_cloud.openapi.models.v1_transfer_org_balance_response import V1TransferOrgBalanceResponse
|
|
729
738
|
from lightning_sdk.lightning_cloud.openapi.models.v1_transfer_project_balance_response import V1TransferProjectBalanceResponse
|
|
@@ -740,6 +749,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_update_cluster_accelerators
|
|
|
740
749
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_cluster_availability_request import V1UpdateClusterAvailabilityRequest
|
|
741
750
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_index_response import V1UpdateIndexResponse
|
|
742
751
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_lit_page_response import V1UpdateLitPageResponse
|
|
752
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_update_metrics_stream_visibility_response import V1UpdateMetricsStreamVisibilityResponse
|
|
743
753
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_model_visibility_response import V1UpdateModelVisibilityResponse
|
|
744
754
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_project_cluster_accelerators_response import V1UpdateProjectClusterAcceleratorsResponse
|
|
745
755
|
from lightning_sdk.lightning_cloud.openapi.models.v1_update_shared_metrics_stream_response import V1UpdateSharedMetricsStreamResponse
|
|
@@ -769,6 +779,8 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_user_slurm_job_action_respo
|
|
|
769
779
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_assistant_status_response import V1ValidateAssistantStatusResponse
|
|
770
780
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_auto_join_domain_response import V1ValidateAutoJoinDomainResponse
|
|
771
781
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_data_connection_response import V1ValidateDataConnectionResponse
|
|
782
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_deployment_image_request import V1ValidateDeploymentImageRequest
|
|
783
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_deployment_image_response import V1ValidateDeploymentImageResponse
|
|
772
784
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_managed_endpoint_request import V1ValidateManagedEndpointRequest
|
|
773
785
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_managed_endpoint_response import V1ValidateManagedEndpointResponse
|
|
774
786
|
from lightning_sdk.lightning_cloud.openapi.models.v1_validate_managed_model_response import V1ValidateManagedModelResponse
|