lightning-sdk 0.1.41__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/ai_hub.py +8 -3
- lightning_sdk/api/ai_hub_api.py +3 -3
- lightning_sdk/api/deployment_api.py +6 -6
- lightning_sdk/api/job_api.py +32 -6
- lightning_sdk/api/mmt_api.py +59 -19
- lightning_sdk/api/studio_api.py +37 -19
- lightning_sdk/api/teamspace_api.py +34 -29
- lightning_sdk/api/utils.py +46 -34
- lightning_sdk/cli/ai_hub.py +3 -3
- lightning_sdk/cli/entrypoint.py +3 -1
- lightning_sdk/cli/mmt.py +11 -10
- lightning_sdk/cli/run.py +9 -8
- lightning_sdk/cli/serve.py +130 -0
- lightning_sdk/deployment/deployment.py +18 -12
- lightning_sdk/job/base.py +118 -24
- lightning_sdk/job/job.py +87 -9
- lightning_sdk/job/v1.py +75 -18
- lightning_sdk/job/v2.py +51 -15
- lightning_sdk/job/work.py +36 -7
- lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
- lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
- lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
- lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
- lightning_sdk/lightning_cloud/rest_client.py +2 -0
- lightning_sdk/mmt/__init__.py +3 -0
- lightning_sdk/{_mmt → mmt}/base.py +20 -14
- lightning_sdk/{_mmt → mmt}/mmt.py +46 -17
- lightning_sdk/mmt/v1.py +129 -0
- lightning_sdk/{_mmt → mmt}/v2.py +16 -21
- lightning_sdk/plugin.py +43 -16
- lightning_sdk/services/file_endpoint.py +11 -5
- lightning_sdk/studio.py +16 -9
- lightning_sdk/teamspace.py +21 -8
- lightning_sdk/utils/resolve.py +18 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/METADATA +3 -1
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/RECORD +72 -59
- lightning_sdk/_mmt/__init__.py +0 -3
- lightning_sdk/_mmt/v1.py +0 -69
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/entry_points.txt +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.42.dist-info}/top_level.txt +0 -0
lightning_sdk/api/utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import concurrent.futures
|
|
1
2
|
import errno
|
|
2
3
|
import math
|
|
3
4
|
import os
|
|
@@ -8,7 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
8
9
|
|
|
9
10
|
import backoff
|
|
10
11
|
import requests
|
|
11
|
-
from tqdm import tqdm
|
|
12
|
+
from tqdm.auto import tqdm
|
|
12
13
|
|
|
13
14
|
from lightning_sdk.constants import __GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__, _LIGHTNING_DEBUG
|
|
14
15
|
from lightning_sdk.lightning_cloud.openapi import (
|
|
@@ -66,14 +67,14 @@ class _FileUploader:
|
|
|
66
67
|
self,
|
|
67
68
|
client: LightningClient,
|
|
68
69
|
teamspace_id: str,
|
|
69
|
-
|
|
70
|
+
cloud_account: str,
|
|
70
71
|
file_path: str,
|
|
71
72
|
remote_path: str,
|
|
72
73
|
progress_bar: bool,
|
|
73
74
|
) -> None:
|
|
74
75
|
self.client = client
|
|
75
76
|
self.teamspace_id = teamspace_id
|
|
76
|
-
self.
|
|
77
|
+
self.cloud_account = cloud_account
|
|
77
78
|
|
|
78
79
|
self.local_path = file_path
|
|
79
80
|
|
|
@@ -107,7 +108,7 @@ class _FileUploader:
|
|
|
107
108
|
|
|
108
109
|
def _multipart_upload(self, count: int) -> None:
|
|
109
110
|
"""Does a parallel multipart upload."""
|
|
110
|
-
body = ProjectIdStorageBody(cluster_id=self.
|
|
111
|
+
body = ProjectIdStorageBody(cluster_id=self.cloud_account, filename=self.remote_path)
|
|
111
112
|
resp: V1UploadProjectArtifactResponse = self.client.storage_service_upload_project_artifact(
|
|
112
113
|
body=body, project_id=self.teamspace_id
|
|
113
114
|
)
|
|
@@ -123,7 +124,7 @@ class _FileUploader:
|
|
|
123
124
|
completed.extend(self._process_upload_batch(executor=p, batch=batch, upload_id=resp.upload_id))
|
|
124
125
|
|
|
125
126
|
completed_body = StorageCompleteBody(
|
|
126
|
-
cluster_id=self.
|
|
127
|
+
cluster_id=self.cloud_account, filename=self.remote_path, parts=completed, upload_id=resp.upload_id
|
|
127
128
|
)
|
|
128
129
|
self.client.storage_service_complete_upload_project_artifact(body=completed_body, project_id=self.teamspace_id)
|
|
129
130
|
|
|
@@ -135,7 +136,7 @@ class _FileUploader:
|
|
|
135
136
|
|
|
136
137
|
def _request_urls(self, parts: List[int], upload_id: str) -> List[V1PresignedUrl]:
|
|
137
138
|
"""Requests urls for a batch of parts."""
|
|
138
|
-
body = UploadsUploadIdBody(cluster_id=self.
|
|
139
|
+
body = UploadsUploadIdBody(cluster_id=self.cloud_account, filename=self.remote_path, parts=parts)
|
|
139
140
|
resp: V1UploadProjectArtifactPartsResponse = self.client.storage_service_upload_project_artifact_parts(
|
|
140
141
|
body, self.teamspace_id, upload_id
|
|
141
142
|
)
|
|
@@ -192,7 +193,7 @@ class _ModelFileUploader:
|
|
|
192
193
|
model_id: str,
|
|
193
194
|
version: str,
|
|
194
195
|
teamspace_id: str,
|
|
195
|
-
|
|
196
|
+
cloud_account: str,
|
|
196
197
|
file_path: str,
|
|
197
198
|
remote_path: str,
|
|
198
199
|
progress_bar: bool,
|
|
@@ -201,7 +202,6 @@ class _ModelFileUploader:
|
|
|
201
202
|
self.model_id = model_id
|
|
202
203
|
self.version = version
|
|
203
204
|
self.teamspace_id = teamspace_id
|
|
204
|
-
self.cluster_id = cluster_id
|
|
205
205
|
self.local_path = file_path
|
|
206
206
|
self.remote_path = remote_path
|
|
207
207
|
|
|
@@ -215,6 +215,8 @@ class _ModelFileUploader:
|
|
|
215
215
|
unit="B",
|
|
216
216
|
unit_scale=True,
|
|
217
217
|
unit_divisor=1000,
|
|
218
|
+
position=1,
|
|
219
|
+
leave=False,
|
|
218
220
|
)
|
|
219
221
|
else:
|
|
220
222
|
self.progress_bar = None
|
|
@@ -376,6 +378,7 @@ class _FileDownloader:
|
|
|
376
378
|
teamspace_id: str,
|
|
377
379
|
remote_path: str,
|
|
378
380
|
file_path: str,
|
|
381
|
+
executor: ThreadPoolExecutor,
|
|
379
382
|
num_workers: int = 20,
|
|
380
383
|
progress_bar: Optional[tqdm] = None,
|
|
381
384
|
) -> None:
|
|
@@ -389,7 +392,7 @@ class _FileDownloader:
|
|
|
389
392
|
self.num_workers = num_workers
|
|
390
393
|
self._url = ""
|
|
391
394
|
self._size = 0
|
|
392
|
-
self.
|
|
395
|
+
self.executor = executor
|
|
393
396
|
|
|
394
397
|
@backoff.on_exception(backoff.expo, ApiException, max_tries=10)
|
|
395
398
|
def refresh(self) -> None:
|
|
@@ -445,26 +448,26 @@ class _FileDownloader:
|
|
|
445
448
|
if remaining_size > 0:
|
|
446
449
|
f.write(b"\x00" * remaining_size)
|
|
447
450
|
|
|
448
|
-
def _multipart_download(self, filename: str,
|
|
449
|
-
num_chunks =
|
|
451
|
+
def _multipart_download(self, filename: str, num_workers: int) -> None:
|
|
452
|
+
num_chunks = num_workers
|
|
450
453
|
chunk_size = math.ceil(self.size / num_chunks)
|
|
451
454
|
|
|
452
455
|
if chunk_size < _DOWNLOAD_MIN_CHUNK_SIZE:
|
|
453
456
|
num_chunks = math.ceil(self.size / _DOWNLOAD_MIN_CHUNK_SIZE)
|
|
454
457
|
chunk_size = _DOWNLOAD_MIN_CHUNK_SIZE
|
|
455
458
|
|
|
456
|
-
num_workers = min(max_workers, num_chunks)
|
|
457
|
-
|
|
458
459
|
ranges = []
|
|
459
460
|
for part_number in range(num_chunks):
|
|
460
461
|
start = part_number * chunk_size
|
|
461
462
|
end = min(start + chunk_size - 1, self.size - 1)
|
|
462
463
|
ranges.append((start, end))
|
|
463
464
|
|
|
464
|
-
|
|
465
|
-
|
|
465
|
+
futures = [self.executor.submit(self._download_chunk, filename, r) for r in ranges]
|
|
466
|
+
concurrent.futures.wait(futures)
|
|
466
467
|
|
|
467
468
|
def download(self) -> None:
|
|
469
|
+
self.refresh()
|
|
470
|
+
|
|
468
471
|
tmp_filename = f"{self.local_path}.download"
|
|
469
472
|
|
|
470
473
|
try:
|
|
@@ -536,31 +539,40 @@ def _download_model_files(
|
|
|
536
539
|
unit_divisor=1000,
|
|
537
540
|
)
|
|
538
541
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
542
|
+
with ThreadPoolExecutor(max_workers=min(num_workers, len(response.filepaths))) as file_executor, ThreadPoolExecutor(
|
|
543
|
+
max_workers=num_workers
|
|
544
|
+
) as part_executor:
|
|
545
|
+
futures = []
|
|
546
|
+
|
|
547
|
+
for filepath in response.filepaths:
|
|
548
|
+
local_file = download_dir / filepath
|
|
549
|
+
local_file.parent.mkdir(parents=True, exist_ok=True)
|
|
550
|
+
|
|
551
|
+
file_downloader = _FileDownloader(
|
|
552
|
+
client=client,
|
|
553
|
+
model_id=response.model_id,
|
|
554
|
+
version=response.version,
|
|
555
|
+
teamspace_id=response.project_id,
|
|
556
|
+
remote_path=filepath,
|
|
557
|
+
file_path=str(local_file),
|
|
558
|
+
num_workers=num_workers,
|
|
559
|
+
progress_bar=pbar,
|
|
560
|
+
executor=part_executor,
|
|
561
|
+
)
|
|
562
|
+
|
|
563
|
+
futures.append(file_executor.submit(file_downloader.download))
|
|
553
564
|
|
|
554
|
-
|
|
565
|
+
# wait for all threads
|
|
566
|
+
concurrent.futures.wait(futures)
|
|
555
567
|
|
|
556
|
-
|
|
568
|
+
return response.filepaths
|
|
557
569
|
|
|
558
570
|
|
|
559
571
|
def _create_app(
|
|
560
572
|
client: CloudSpaceServiceApi,
|
|
561
573
|
studio_id: str,
|
|
562
574
|
teamspace_id: str,
|
|
563
|
-
|
|
575
|
+
cloud_account: str,
|
|
564
576
|
plugin_type: str,
|
|
565
577
|
**other_arguments: Any,
|
|
566
578
|
) -> Externalv1LightningappInstance:
|
|
@@ -573,7 +585,7 @@ def _create_app(
|
|
|
573
585
|
del other_arguments["interruptible"]
|
|
574
586
|
|
|
575
587
|
body = AppsIdBody(
|
|
576
|
-
cluster_id=
|
|
588
|
+
cluster_id=cloud_account,
|
|
577
589
|
plugin_arguments=other_arguments,
|
|
578
590
|
service_id=os.getenv(_LIGHTNING_SERVICE_EXECUTION_ID_KEY),
|
|
579
591
|
unique_id=__GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__[studio_id],
|
|
@@ -584,6 +596,6 @@ def _create_app(
|
|
|
584
596
|
).lightningappinstance
|
|
585
597
|
|
|
586
598
|
if _LIGHTNING_DEBUG:
|
|
587
|
-
print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {
|
|
599
|
+
print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {cloud_account=}")
|
|
588
600
|
|
|
589
601
|
return resp
|
lightning_sdk/cli/ai_hub.py
CHANGED
|
@@ -32,7 +32,7 @@ class _AIHub(_StudiosMenu):
|
|
|
32
32
|
def deploy(
|
|
33
33
|
self,
|
|
34
34
|
api_id: str,
|
|
35
|
-
|
|
35
|
+
cloud_account: Optional[str] = None,
|
|
36
36
|
name: Optional[str] = None,
|
|
37
37
|
teamspace: Optional[str] = None,
|
|
38
38
|
org: Optional[str] = None,
|
|
@@ -41,9 +41,9 @@ class _AIHub(_StudiosMenu):
|
|
|
41
41
|
|
|
42
42
|
Args:
|
|
43
43
|
api_id: API template ID.
|
|
44
|
-
|
|
44
|
+
cloud_account: Cloud Account to deploy the API to. Defaults to user's default cloud account.
|
|
45
45
|
name: Name of the deployed API. Defaults to the name of the API template.
|
|
46
46
|
teamspace: Teamspace to deploy the API to. Defaults to user's default teamspace.
|
|
47
47
|
org: Organization to deploy the API to. Defaults to user's default organization.
|
|
48
48
|
"""
|
|
49
|
-
return self._hub.run(api_id,
|
|
49
|
+
return self._hub.run(api_id, cloud_account=cloud_account, name=name, teamspace=teamspace, org=org)
|
lightning_sdk/cli/entrypoint.py
CHANGED
|
@@ -6,6 +6,7 @@ from lightning_sdk.cli.ai_hub import _AIHub
|
|
|
6
6
|
from lightning_sdk.cli.download import _Downloads
|
|
7
7
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
8
8
|
from lightning_sdk.cli.run import _Run
|
|
9
|
+
from lightning_sdk.cli.serve import _Docker, _LitServe
|
|
9
10
|
from lightning_sdk.cli.upload import _Uploads
|
|
10
11
|
from lightning_sdk.lightning_cloud.login import Auth
|
|
11
12
|
|
|
@@ -19,8 +20,9 @@ class StudioCLI:
|
|
|
19
20
|
self.download = _Downloads()
|
|
20
21
|
self.upload = _Uploads()
|
|
21
22
|
self.aihub = _AIHub()
|
|
22
|
-
|
|
23
23
|
self.run = _Run(legacy_run=_LegacyLightningCLI() if _LIGHTNING_AVAILABLE else None)
|
|
24
|
+
self.serve = _LitServe()
|
|
25
|
+
self.dockerize = _Docker()
|
|
24
26
|
|
|
25
27
|
def login(self) -> None:
|
|
26
28
|
"""Login to Lightning AI Studios."""
|
lightning_sdk/cli/mmt.py
CHANGED
|
@@ -31,14 +31,15 @@ class MMTCLI:
|
|
|
31
31
|
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
32
32
|
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
33
33
|
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
cloud_account: The cloud account to run the job on.
|
|
35
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
36
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
36
37
|
env: Environment variables to set inside the job.
|
|
37
38
|
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
38
39
|
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
39
40
|
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
40
|
-
|
|
41
|
-
Required if the registry is part of a
|
|
41
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
42
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
42
43
|
artifacts_local: The path of inside the docker container, you want to persist images from.
|
|
43
44
|
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
44
45
|
Only supported for jobs with a docker image compute environment.
|
|
@@ -83,11 +84,11 @@ class MMTCLI:
|
|
|
83
84
|
teamspace: Optional[str] = None,
|
|
84
85
|
org: Optional[str] = None,
|
|
85
86
|
user: Optional[str] = None,
|
|
86
|
-
|
|
87
|
+
cloud_account: Optional[str] = None,
|
|
87
88
|
env: Optional[Dict[str, str]] = None,
|
|
88
89
|
interruptible: bool = False,
|
|
89
90
|
image_credentials: Optional[str] = None,
|
|
90
|
-
|
|
91
|
+
cloud_account_auth: bool = False,
|
|
91
92
|
artifacts_local: Optional[str] = None,
|
|
92
93
|
artifacts_remote: Optional[str] = None,
|
|
93
94
|
) -> None:
|
|
@@ -103,8 +104,8 @@ class MMTCLI:
|
|
|
103
104
|
machine_enum = Machine(machine.upper())
|
|
104
105
|
|
|
105
106
|
teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
106
|
-
if
|
|
107
|
-
|
|
107
|
+
if cloud_account is None:
|
|
108
|
+
cloud_account = teamspace.default_cloud_account
|
|
108
109
|
|
|
109
110
|
if image is None:
|
|
110
111
|
raise RuntimeError("Currently only docker images are specified")
|
|
@@ -118,11 +119,11 @@ class MMTCLI:
|
|
|
118
119
|
teamspace=teamspace,
|
|
119
120
|
org=org,
|
|
120
121
|
user=user,
|
|
121
|
-
|
|
122
|
+
cloud_account=cloud_account,
|
|
122
123
|
env=env,
|
|
123
124
|
interruptible=interruptible,
|
|
124
125
|
image_credentials=image_credentials,
|
|
125
|
-
|
|
126
|
+
cloud_account_auth=cloud_account_auth,
|
|
126
127
|
artifacts_local=artifacts_local,
|
|
127
128
|
artifacts_remote=artifacts_remote,
|
|
128
129
|
)
|
lightning_sdk/cli/run.py
CHANGED
|
@@ -32,14 +32,15 @@ class _Run:
|
|
|
32
32
|
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
33
33
|
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
34
34
|
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
cloud_account: The cloud account to run the job on.
|
|
36
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
37
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
37
38
|
env: Environment variables to set inside the job.
|
|
38
39
|
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
39
40
|
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
40
41
|
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
41
|
-
|
|
42
|
-
Required if the registry is part of a
|
|
42
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
43
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
43
44
|
artifacts_local: The path of inside the docker container, you want to persist images from.
|
|
44
45
|
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
45
46
|
Only supported for jobs with a docker image compute environment.
|
|
@@ -68,11 +69,11 @@ class _Run:
|
|
|
68
69
|
teamspace: Optional[str] = None,
|
|
69
70
|
org: Optional[str] = None,
|
|
70
71
|
user: Optional[str] = None,
|
|
71
|
-
|
|
72
|
+
cloud_account: Optional[str] = None,
|
|
72
73
|
env: Optional[Dict[str, str]] = None,
|
|
73
74
|
interruptible: bool = False,
|
|
74
75
|
image_credentials: Optional[str] = None,
|
|
75
|
-
|
|
76
|
+
cloud_account_auth: bool = False,
|
|
76
77
|
artifacts_local: Optional[str] = None,
|
|
77
78
|
artifacts_remote: Optional[str] = None,
|
|
78
79
|
) -> None:
|
|
@@ -86,11 +87,11 @@ class _Run:
|
|
|
86
87
|
teamspace=teamspace,
|
|
87
88
|
org=org,
|
|
88
89
|
user=user,
|
|
89
|
-
|
|
90
|
+
cloud_account=cloud_account,
|
|
90
91
|
env=env,
|
|
91
92
|
interruptible=interruptible,
|
|
92
93
|
image_credentials=image_credentials,
|
|
93
|
-
|
|
94
|
+
cloud_account_auth=cloud_account_auth,
|
|
94
95
|
artifacts_local=artifacts_local,
|
|
95
96
|
artifacts_remote=artifacts_remote,
|
|
96
97
|
)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import warnings
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Union
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _LitServe:
|
|
11
|
+
"""Serve a LitServe model.
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
lightning serve api server.py
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def api(
|
|
18
|
+
self,
|
|
19
|
+
script_path: Union[str, Path],
|
|
20
|
+
easy: bool = False,
|
|
21
|
+
) -> None:
|
|
22
|
+
"""Deploy a LitServe model script.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
script_path: Path to the script to serve
|
|
26
|
+
easy: If True, generates a client for the model
|
|
27
|
+
|
|
28
|
+
Raises:
|
|
29
|
+
FileNotFoundError: If script_path doesn't exist
|
|
30
|
+
ImportError: If litserve is not installed
|
|
31
|
+
subprocess.CalledProcessError: If the script fails to run
|
|
32
|
+
IOError: If client.py generation fails
|
|
33
|
+
"""
|
|
34
|
+
console = Console()
|
|
35
|
+
script_path = Path(script_path)
|
|
36
|
+
if not script_path.exists():
|
|
37
|
+
raise FileNotFoundError(f"Script not found: {script_path}")
|
|
38
|
+
if not script_path.is_file():
|
|
39
|
+
raise ValueError(f"Path is not a file: {script_path}")
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
from litserve.python_client import client_template
|
|
43
|
+
except ImportError:
|
|
44
|
+
raise ImportError(
|
|
45
|
+
"litserve is not installed. Please install it with `pip install lightning_sdk[serve]`"
|
|
46
|
+
) from None
|
|
47
|
+
|
|
48
|
+
if easy:
|
|
49
|
+
client_path = Path("client.py")
|
|
50
|
+
if client_path.exists():
|
|
51
|
+
console.print("Skipping client generation: client.py already exists", style="blue")
|
|
52
|
+
else:
|
|
53
|
+
try:
|
|
54
|
+
client_path.write_text(client_template)
|
|
55
|
+
console.print("✅ Client generated at client.py", style="bold green")
|
|
56
|
+
except OSError as e:
|
|
57
|
+
raise OSError(f"Failed to generate client.py: {e!s}") from None
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
subprocess.run(
|
|
61
|
+
["python", str(script_path)],
|
|
62
|
+
check=True,
|
|
63
|
+
text=True,
|
|
64
|
+
)
|
|
65
|
+
except subprocess.CalledProcessError as e:
|
|
66
|
+
error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
|
|
67
|
+
raise RuntimeError(error_msg) from None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class _Docker:
|
|
71
|
+
"""Generate a Dockerfile for a LitServe model."""
|
|
72
|
+
|
|
73
|
+
def api(self, server_filename: str, port: int = 8000, gpu: bool = False) -> None:
|
|
74
|
+
"""Generate a Dockerfile for the given server code.
|
|
75
|
+
|
|
76
|
+
Example:
|
|
77
|
+
lightning litserve dockerize server.py --port 8000 --gpu
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
server_filename (str): The path to the server file. Example sever.py or app.py.
|
|
81
|
+
port (int, optional): The port to expose in the Docker container.
|
|
82
|
+
gpu (bool, optional): Whether to use a GPU-enabled Docker image.
|
|
83
|
+
"""
|
|
84
|
+
import litserve as ls
|
|
85
|
+
from litserve import docker_builder
|
|
86
|
+
|
|
87
|
+
console = Console()
|
|
88
|
+
requirements = ""
|
|
89
|
+
if os.path.exists("requirements.txt"):
|
|
90
|
+
requirements = "-r requirements.txt"
|
|
91
|
+
else:
|
|
92
|
+
warnings.warn(
|
|
93
|
+
f"requirements.txt not found at {os.getcwd()}. "
|
|
94
|
+
f"Make sure to install the required packages in the Dockerfile.",
|
|
95
|
+
UserWarning,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
current_dir = Path.cwd()
|
|
99
|
+
if not (current_dir / server_filename).is_file():
|
|
100
|
+
raise FileNotFoundError(f"Server file `{server_filename}` must be in the current directory: {os.getcwd()}")
|
|
101
|
+
|
|
102
|
+
version = ls.__version__
|
|
103
|
+
if gpu:
|
|
104
|
+
run_cmd = f"docker run --gpus all -p {port}:{port} litserve-model:latest"
|
|
105
|
+
docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
|
|
106
|
+
else:
|
|
107
|
+
run_cmd = f"docker run -p {port}:{port} litserve-model:latest"
|
|
108
|
+
docker_template = docker_builder.DOCKERFILE_TEMPLATE
|
|
109
|
+
dockerfile_content = docker_template.format(
|
|
110
|
+
server_filename=server_filename,
|
|
111
|
+
port=port,
|
|
112
|
+
version=version,
|
|
113
|
+
requirements=requirements,
|
|
114
|
+
)
|
|
115
|
+
with open("Dockerfile", "w") as f:
|
|
116
|
+
f.write(dockerfile_content)
|
|
117
|
+
|
|
118
|
+
success_msg = f"""[bold]Dockerfile created successfully[/bold]
|
|
119
|
+
Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
|
|
120
|
+
|
|
121
|
+
[bold]Build the container with:[/bold]
|
|
122
|
+
> [underline]docker build -t litserve-model .[/underline]
|
|
123
|
+
|
|
124
|
+
[bold]To run the Docker container on the machine:[/bold]
|
|
125
|
+
> [underline]{run_cmd}[/underline]
|
|
126
|
+
|
|
127
|
+
[bold]To push the container to a registry:[/bold]
|
|
128
|
+
> [underline]docker push litserve-model[/underline]
|
|
129
|
+
"""
|
|
130
|
+
console.print(success_msg)
|
|
@@ -32,7 +32,7 @@ from lightning_sdk.organization import Organization
|
|
|
32
32
|
from lightning_sdk.services.utilities import _get_cluster
|
|
33
33
|
from lightning_sdk.teamspace import Teamspace
|
|
34
34
|
from lightning_sdk.user import User
|
|
35
|
-
from lightning_sdk.utils.resolve import _resolve_org, _resolve_teamspace, _resolve_user
|
|
35
|
+
from lightning_sdk.utils.resolve import _resolve_deprecated_cluster, _resolve_org, _resolve_teamspace, _resolve_user
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
class Deployment:
|
|
@@ -81,7 +81,7 @@ class Deployment:
|
|
|
81
81
|
raise ValueError("You need to pass a teamspace or an org for your deployment.")
|
|
82
82
|
|
|
83
83
|
self._deployment_api = DeploymentApi()
|
|
84
|
-
self.
|
|
84
|
+
self._cloud_account = _get_cluster(client=self._deployment_api._client, project_id=self._teamspace.id)
|
|
85
85
|
self._is_created = False
|
|
86
86
|
deployment = self._deployment_api.get_deployment_by_name(name, self._teamspace.id)
|
|
87
87
|
if deployment:
|
|
@@ -102,8 +102,9 @@ class Deployment:
|
|
|
102
102
|
replicas: Optional[int] = None,
|
|
103
103
|
health_check: Optional[Union[HttpHealthCheck, ExecHealthCheck]] = None,
|
|
104
104
|
auth: Optional[Union[BasicAuth, TokenAuth]] = None,
|
|
105
|
-
|
|
105
|
+
cloud_account: Optional[str] = None,
|
|
106
106
|
custom_domain: Optional[str] = None,
|
|
107
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
107
108
|
) -> None:
|
|
108
109
|
"""The Lightning AI Deployment.
|
|
109
110
|
|
|
@@ -124,7 +125,7 @@ class Deployment:
|
|
|
124
125
|
replicas: The number of replicas to deploy with.
|
|
125
126
|
health_check: The health check config to know whether your service is ready to receive traffic.
|
|
126
127
|
auth: The auth config to protect your services. Only Basic and Token supported.
|
|
127
|
-
|
|
128
|
+
cloud_account: The name of the cloud account, the studio should be created on.
|
|
128
129
|
Doesn't matter when the studio already exists.
|
|
129
130
|
custom_domain: Whether your service would be referenced under a custom doamin.
|
|
130
131
|
|
|
@@ -136,9 +137,11 @@ class Deployment:
|
|
|
136
137
|
if self._is_created:
|
|
137
138
|
raise RuntimeError("This deployment has already been started.")
|
|
138
139
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
140
|
+
cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
|
|
141
|
+
|
|
142
|
+
if cloud_account is None and self._cloud_account is not None:
|
|
143
|
+
print(f"No cloud account was provided, defaulting to {self._cloud_account.cluster_id}")
|
|
144
|
+
cloud_account = os.getenv("LIGHTNING_CLUSTER_ID") or self._cloud_account.cluster_id
|
|
142
145
|
|
|
143
146
|
self._deployment = self._deployment_api.create_deployment(
|
|
144
147
|
V1Deployment(
|
|
@@ -148,7 +151,7 @@ class Deployment:
|
|
|
148
151
|
project_id=self._teamspace.id,
|
|
149
152
|
replicas=replicas,
|
|
150
153
|
spec=to_spec(
|
|
151
|
-
|
|
154
|
+
cloud_account=cloud_account,
|
|
152
155
|
command=command,
|
|
153
156
|
entrypoint=entrypoint,
|
|
154
157
|
env=env,
|
|
@@ -171,7 +174,7 @@ class Deployment:
|
|
|
171
174
|
command: Optional[str] = None,
|
|
172
175
|
env: Optional[List[Union[Env, Secret]]] = None,
|
|
173
176
|
spot: Optional[bool] = None,
|
|
174
|
-
|
|
177
|
+
cloud_account: Optional[str] = None,
|
|
175
178
|
health_check: Optional[Union[HttpHealthCheck, ExecHealthCheck]] = None,
|
|
176
179
|
# Changing those arguments don't create a new release
|
|
177
180
|
min_replicas: Optional[int] = None,
|
|
@@ -182,7 +185,10 @@ class Deployment:
|
|
|
182
185
|
replicas: Optional[int] = None,
|
|
183
186
|
auth: Optional[Union[BasicAuth, TokenAuth]] = None,
|
|
184
187
|
custom_domain: Optional[str] = None,
|
|
188
|
+
cluster: Optional[str] = None, # deprecated in favor of cloud_account
|
|
185
189
|
) -> None:
|
|
190
|
+
cloud_account = _resolve_deprecated_cluster(cloud_account, cluster)
|
|
191
|
+
|
|
186
192
|
self._deployment = self._deployment_api.update_deployment(
|
|
187
193
|
self._deployment,
|
|
188
194
|
name=name or self._name,
|
|
@@ -190,7 +196,7 @@ class Deployment:
|
|
|
190
196
|
replicas=replicas,
|
|
191
197
|
min_replicas=min_replicas,
|
|
192
198
|
max_replicas=max_replicas,
|
|
193
|
-
|
|
199
|
+
cloud_account=cloud_account,
|
|
194
200
|
machine=machine,
|
|
195
201
|
environment=environment,
|
|
196
202
|
entrypoint=entrypoint,
|
|
@@ -312,8 +318,8 @@ class Deployment:
|
|
|
312
318
|
return None
|
|
313
319
|
|
|
314
320
|
@property
|
|
315
|
-
def
|
|
316
|
-
"""The
|
|
321
|
+
def cloud_account(self) -> Optional[str]:
|
|
322
|
+
"""The cloud_account of the replicas."""
|
|
317
323
|
if self._deployment:
|
|
318
324
|
self._deployment = self._deployment_api.get_deployment_by_name(self._name, self._teamspace.id)
|
|
319
325
|
return self._deployment.spec.cluster_id
|