lightning-sdk 0.1.41__py3-none-any.whl → 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/ai_hub.py +8 -3
- lightning_sdk/api/ai_hub_api.py +3 -3
- lightning_sdk/api/deployment_api.py +6 -6
- lightning_sdk/api/job_api.py +32 -6
- lightning_sdk/api/mmt_api.py +59 -19
- lightning_sdk/api/studio_api.py +37 -19
- lightning_sdk/api/teamspace_api.py +34 -29
- lightning_sdk/api/utils.py +46 -34
- lightning_sdk/cli/ai_hub.py +3 -3
- lightning_sdk/cli/entrypoint.py +3 -1
- lightning_sdk/cli/run.py +122 -12
- lightning_sdk/cli/serve.py +218 -0
- lightning_sdk/deployment/deployment.py +18 -12
- lightning_sdk/job/base.py +118 -24
- lightning_sdk/job/job.py +98 -9
- lightning_sdk/job/v1.py +75 -18
- lightning_sdk/job/v2.py +51 -15
- lightning_sdk/job/work.py +36 -7
- lightning_sdk/lightning_cloud/openapi/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/api/jobs_service_api.py +215 -5
- lightning_sdk/lightning_cloud/openapi/api/lit_logger_service_api.py +218 -0
- lightning_sdk/lightning_cloud/openapi/api/models_store_api.py +226 -0
- lightning_sdk/lightning_cloud/openapi/api/snowflake_service_api.py +21 -1
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +12 -0
- lightning_sdk/lightning_cloud/openapi/models/deploymenttemplates_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/id_visibility_body.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/model_id_versions_body.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/project_id_multimachinejobs_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/project_id_snowflake_body.py +15 -67
- lightning_sdk/lightning_cloud/openapi/models/query_query_id_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/snowflake_export_body.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/snowflake_query_body.py +17 -69
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_file_url_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_response.py +17 -17
- lightning_sdk/lightning_cloud/openapi/models/v1_get_model_files_url_response.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_get_project_balance_response.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_list_multi_machine_job_events_response.py +123 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_metrics_stream.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_model_file.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event.py +331 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_event_type.py +104 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_fault_tolerance_strategy.py +105 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_status.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_rule_resource.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_snowflake_data_connection.py +29 -81
- lightning_sdk/lightning_cloud/openapi/models/v1_system_metrics.py +29 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_trainium_system_metrics.py +175 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_update_metrics_stream_visibility_response.py +97 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_request.py +149 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_validate_deployment_image_response.py +97 -0
- lightning_sdk/lightning_cloud/rest_client.py +2 -0
- lightning_sdk/mmt/__init__.py +4 -0
- lightning_sdk/mmt/base.py +278 -0
- lightning_sdk/mmt/mmt.py +267 -0
- lightning_sdk/mmt/v1.py +181 -0
- lightning_sdk/mmt/v2.py +188 -0
- lightning_sdk/plugin.py +43 -16
- lightning_sdk/services/file_endpoint.py +11 -5
- lightning_sdk/studio.py +16 -9
- lightning_sdk/teamspace.py +21 -8
- lightning_sdk/utils/resolve.py +18 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/METADATA +4 -1
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/RECORD +71 -59
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/entry_points.txt +0 -1
- lightning_sdk/_mmt/__init__.py +0 -3
- lightning_sdk/_mmt/base.py +0 -180
- lightning_sdk/_mmt/mmt.py +0 -161
- lightning_sdk/_mmt/v1.py +0 -69
- lightning_sdk/_mmt/v2.py +0 -141
- lightning_sdk/cli/mmt.py +0 -137
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.41.dist-info → lightning_sdk-0.1.43.dist-info}/top_level.txt +0 -0
lightning_sdk/api/utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import concurrent.futures
|
|
1
2
|
import errno
|
|
2
3
|
import math
|
|
3
4
|
import os
|
|
@@ -8,7 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
8
9
|
|
|
9
10
|
import backoff
|
|
10
11
|
import requests
|
|
11
|
-
from tqdm import tqdm
|
|
12
|
+
from tqdm.auto import tqdm
|
|
12
13
|
|
|
13
14
|
from lightning_sdk.constants import __GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__, _LIGHTNING_DEBUG
|
|
14
15
|
from lightning_sdk.lightning_cloud.openapi import (
|
|
@@ -66,14 +67,14 @@ class _FileUploader:
|
|
|
66
67
|
self,
|
|
67
68
|
client: LightningClient,
|
|
68
69
|
teamspace_id: str,
|
|
69
|
-
|
|
70
|
+
cloud_account: str,
|
|
70
71
|
file_path: str,
|
|
71
72
|
remote_path: str,
|
|
72
73
|
progress_bar: bool,
|
|
73
74
|
) -> None:
|
|
74
75
|
self.client = client
|
|
75
76
|
self.teamspace_id = teamspace_id
|
|
76
|
-
self.
|
|
77
|
+
self.cloud_account = cloud_account
|
|
77
78
|
|
|
78
79
|
self.local_path = file_path
|
|
79
80
|
|
|
@@ -107,7 +108,7 @@ class _FileUploader:
|
|
|
107
108
|
|
|
108
109
|
def _multipart_upload(self, count: int) -> None:
|
|
109
110
|
"""Does a parallel multipart upload."""
|
|
110
|
-
body = ProjectIdStorageBody(cluster_id=self.
|
|
111
|
+
body = ProjectIdStorageBody(cluster_id=self.cloud_account, filename=self.remote_path)
|
|
111
112
|
resp: V1UploadProjectArtifactResponse = self.client.storage_service_upload_project_artifact(
|
|
112
113
|
body=body, project_id=self.teamspace_id
|
|
113
114
|
)
|
|
@@ -123,7 +124,7 @@ class _FileUploader:
|
|
|
123
124
|
completed.extend(self._process_upload_batch(executor=p, batch=batch, upload_id=resp.upload_id))
|
|
124
125
|
|
|
125
126
|
completed_body = StorageCompleteBody(
|
|
126
|
-
cluster_id=self.
|
|
127
|
+
cluster_id=self.cloud_account, filename=self.remote_path, parts=completed, upload_id=resp.upload_id
|
|
127
128
|
)
|
|
128
129
|
self.client.storage_service_complete_upload_project_artifact(body=completed_body, project_id=self.teamspace_id)
|
|
129
130
|
|
|
@@ -135,7 +136,7 @@ class _FileUploader:
|
|
|
135
136
|
|
|
136
137
|
def _request_urls(self, parts: List[int], upload_id: str) -> List[V1PresignedUrl]:
|
|
137
138
|
"""Requests urls for a batch of parts."""
|
|
138
|
-
body = UploadsUploadIdBody(cluster_id=self.
|
|
139
|
+
body = UploadsUploadIdBody(cluster_id=self.cloud_account, filename=self.remote_path, parts=parts)
|
|
139
140
|
resp: V1UploadProjectArtifactPartsResponse = self.client.storage_service_upload_project_artifact_parts(
|
|
140
141
|
body, self.teamspace_id, upload_id
|
|
141
142
|
)
|
|
@@ -192,7 +193,7 @@ class _ModelFileUploader:
|
|
|
192
193
|
model_id: str,
|
|
193
194
|
version: str,
|
|
194
195
|
teamspace_id: str,
|
|
195
|
-
|
|
196
|
+
cloud_account: str,
|
|
196
197
|
file_path: str,
|
|
197
198
|
remote_path: str,
|
|
198
199
|
progress_bar: bool,
|
|
@@ -201,7 +202,6 @@ class _ModelFileUploader:
|
|
|
201
202
|
self.model_id = model_id
|
|
202
203
|
self.version = version
|
|
203
204
|
self.teamspace_id = teamspace_id
|
|
204
|
-
self.cluster_id = cluster_id
|
|
205
205
|
self.local_path = file_path
|
|
206
206
|
self.remote_path = remote_path
|
|
207
207
|
|
|
@@ -215,6 +215,8 @@ class _ModelFileUploader:
|
|
|
215
215
|
unit="B",
|
|
216
216
|
unit_scale=True,
|
|
217
217
|
unit_divisor=1000,
|
|
218
|
+
position=1,
|
|
219
|
+
leave=False,
|
|
218
220
|
)
|
|
219
221
|
else:
|
|
220
222
|
self.progress_bar = None
|
|
@@ -376,6 +378,7 @@ class _FileDownloader:
|
|
|
376
378
|
teamspace_id: str,
|
|
377
379
|
remote_path: str,
|
|
378
380
|
file_path: str,
|
|
381
|
+
executor: ThreadPoolExecutor,
|
|
379
382
|
num_workers: int = 20,
|
|
380
383
|
progress_bar: Optional[tqdm] = None,
|
|
381
384
|
) -> None:
|
|
@@ -389,7 +392,7 @@ class _FileDownloader:
|
|
|
389
392
|
self.num_workers = num_workers
|
|
390
393
|
self._url = ""
|
|
391
394
|
self._size = 0
|
|
392
|
-
self.
|
|
395
|
+
self.executor = executor
|
|
393
396
|
|
|
394
397
|
@backoff.on_exception(backoff.expo, ApiException, max_tries=10)
|
|
395
398
|
def refresh(self) -> None:
|
|
@@ -445,26 +448,26 @@ class _FileDownloader:
|
|
|
445
448
|
if remaining_size > 0:
|
|
446
449
|
f.write(b"\x00" * remaining_size)
|
|
447
450
|
|
|
448
|
-
def _multipart_download(self, filename: str,
|
|
449
|
-
num_chunks =
|
|
451
|
+
def _multipart_download(self, filename: str, num_workers: int) -> None:
|
|
452
|
+
num_chunks = num_workers
|
|
450
453
|
chunk_size = math.ceil(self.size / num_chunks)
|
|
451
454
|
|
|
452
455
|
if chunk_size < _DOWNLOAD_MIN_CHUNK_SIZE:
|
|
453
456
|
num_chunks = math.ceil(self.size / _DOWNLOAD_MIN_CHUNK_SIZE)
|
|
454
457
|
chunk_size = _DOWNLOAD_MIN_CHUNK_SIZE
|
|
455
458
|
|
|
456
|
-
num_workers = min(max_workers, num_chunks)
|
|
457
|
-
|
|
458
459
|
ranges = []
|
|
459
460
|
for part_number in range(num_chunks):
|
|
460
461
|
start = part_number * chunk_size
|
|
461
462
|
end = min(start + chunk_size - 1, self.size - 1)
|
|
462
463
|
ranges.append((start, end))
|
|
463
464
|
|
|
464
|
-
|
|
465
|
-
|
|
465
|
+
futures = [self.executor.submit(self._download_chunk, filename, r) for r in ranges]
|
|
466
|
+
concurrent.futures.wait(futures)
|
|
466
467
|
|
|
467
468
|
def download(self) -> None:
|
|
469
|
+
self.refresh()
|
|
470
|
+
|
|
468
471
|
tmp_filename = f"{self.local_path}.download"
|
|
469
472
|
|
|
470
473
|
try:
|
|
@@ -536,31 +539,40 @@ def _download_model_files(
|
|
|
536
539
|
unit_divisor=1000,
|
|
537
540
|
)
|
|
538
541
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
542
|
+
with ThreadPoolExecutor(max_workers=min(num_workers, len(response.filepaths))) as file_executor, ThreadPoolExecutor(
|
|
543
|
+
max_workers=num_workers
|
|
544
|
+
) as part_executor:
|
|
545
|
+
futures = []
|
|
546
|
+
|
|
547
|
+
for filepath in response.filepaths:
|
|
548
|
+
local_file = download_dir / filepath
|
|
549
|
+
local_file.parent.mkdir(parents=True, exist_ok=True)
|
|
550
|
+
|
|
551
|
+
file_downloader = _FileDownloader(
|
|
552
|
+
client=client,
|
|
553
|
+
model_id=response.model_id,
|
|
554
|
+
version=response.version,
|
|
555
|
+
teamspace_id=response.project_id,
|
|
556
|
+
remote_path=filepath,
|
|
557
|
+
file_path=str(local_file),
|
|
558
|
+
num_workers=num_workers,
|
|
559
|
+
progress_bar=pbar,
|
|
560
|
+
executor=part_executor,
|
|
561
|
+
)
|
|
562
|
+
|
|
563
|
+
futures.append(file_executor.submit(file_downloader.download))
|
|
553
564
|
|
|
554
|
-
|
|
565
|
+
# wait for all threads
|
|
566
|
+
concurrent.futures.wait(futures)
|
|
555
567
|
|
|
556
|
-
|
|
568
|
+
return response.filepaths
|
|
557
569
|
|
|
558
570
|
|
|
559
571
|
def _create_app(
|
|
560
572
|
client: CloudSpaceServiceApi,
|
|
561
573
|
studio_id: str,
|
|
562
574
|
teamspace_id: str,
|
|
563
|
-
|
|
575
|
+
cloud_account: str,
|
|
564
576
|
plugin_type: str,
|
|
565
577
|
**other_arguments: Any,
|
|
566
578
|
) -> Externalv1LightningappInstance:
|
|
@@ -573,7 +585,7 @@ def _create_app(
|
|
|
573
585
|
del other_arguments["interruptible"]
|
|
574
586
|
|
|
575
587
|
body = AppsIdBody(
|
|
576
|
-
cluster_id=
|
|
588
|
+
cluster_id=cloud_account,
|
|
577
589
|
plugin_arguments=other_arguments,
|
|
578
590
|
service_id=os.getenv(_LIGHTNING_SERVICE_EXECUTION_ID_KEY),
|
|
579
591
|
unique_id=__GLOBAL_LIGHTNING_UNIQUE_IDS_STORE__[studio_id],
|
|
@@ -584,6 +596,6 @@ def _create_app(
|
|
|
584
596
|
).lightningappinstance
|
|
585
597
|
|
|
586
598
|
if _LIGHTNING_DEBUG:
|
|
587
|
-
print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {
|
|
599
|
+
print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {cloud_account=}")
|
|
588
600
|
|
|
589
601
|
return resp
|
lightning_sdk/cli/ai_hub.py
CHANGED
|
@@ -32,7 +32,7 @@ class _AIHub(_StudiosMenu):
|
|
|
32
32
|
def deploy(
|
|
33
33
|
self,
|
|
34
34
|
api_id: str,
|
|
35
|
-
|
|
35
|
+
cloud_account: Optional[str] = None,
|
|
36
36
|
name: Optional[str] = None,
|
|
37
37
|
teamspace: Optional[str] = None,
|
|
38
38
|
org: Optional[str] = None,
|
|
@@ -41,9 +41,9 @@ class _AIHub(_StudiosMenu):
|
|
|
41
41
|
|
|
42
42
|
Args:
|
|
43
43
|
api_id: API template ID.
|
|
44
|
-
|
|
44
|
+
cloud_account: Cloud Account to deploy the API to. Defaults to user's default cloud account.
|
|
45
45
|
name: Name of the deployed API. Defaults to the name of the API template.
|
|
46
46
|
teamspace: Teamspace to deploy the API to. Defaults to user's default teamspace.
|
|
47
47
|
org: Organization to deploy the API to. Defaults to user's default organization.
|
|
48
48
|
"""
|
|
49
|
-
return self._hub.run(api_id,
|
|
49
|
+
return self._hub.run(api_id, cloud_account=cloud_account, name=name, teamspace=teamspace, org=org)
|
lightning_sdk/cli/entrypoint.py
CHANGED
|
@@ -6,6 +6,7 @@ from lightning_sdk.cli.ai_hub import _AIHub
|
|
|
6
6
|
from lightning_sdk.cli.download import _Downloads
|
|
7
7
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
8
8
|
from lightning_sdk.cli.run import _Run
|
|
9
|
+
from lightning_sdk.cli.serve import _Docker, _LitServe
|
|
9
10
|
from lightning_sdk.cli.upload import _Uploads
|
|
10
11
|
from lightning_sdk.lightning_cloud.login import Auth
|
|
11
12
|
|
|
@@ -19,8 +20,9 @@ class StudioCLI:
|
|
|
19
20
|
self.download = _Downloads()
|
|
20
21
|
self.upload = _Uploads()
|
|
21
22
|
self.aihub = _AIHub()
|
|
22
|
-
|
|
23
23
|
self.run = _Run(legacy_run=_LegacyLightningCLI() if _LIGHTNING_AVAILABLE else None)
|
|
24
|
+
self.serve = _LitServe()
|
|
25
|
+
self.dockerize = _Docker()
|
|
24
26
|
|
|
25
27
|
def login(self) -> None:
|
|
26
28
|
"""Login to Lightning AI Studios."""
|
lightning_sdk/cli/run.py
CHANGED
|
@@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Dict, Optional
|
|
|
2
2
|
|
|
3
3
|
from lightning_sdk.job import Job
|
|
4
4
|
from lightning_sdk.machine import Machine
|
|
5
|
+
from lightning_sdk.mmt import MMT
|
|
6
|
+
from lightning_sdk.teamspace import Teamspace
|
|
5
7
|
|
|
6
8
|
if TYPE_CHECKING:
|
|
7
9
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
@@ -20,7 +22,7 @@ class _Run:
|
|
|
20
22
|
# Need to set the docstring here for f-strings to work.
|
|
21
23
|
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
22
24
|
# and fire does not show values for literals, just that it is a literal.
|
|
23
|
-
|
|
25
|
+
docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
|
|
24
26
|
|
|
25
27
|
Args:
|
|
26
28
|
name: The name of the job. Needs to be unique within the teamspace.
|
|
@@ -32,14 +34,15 @@ class _Run:
|
|
|
32
34
|
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
33
35
|
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
34
36
|
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
35
|
-
|
|
36
|
-
|
|
37
|
+
cloud_account: The cloud account to run the job on.
|
|
38
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
39
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
37
40
|
env: Environment variables to set inside the job.
|
|
38
41
|
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
39
42
|
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
40
43
|
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
41
|
-
|
|
42
|
-
Required if the registry is part of a
|
|
44
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
45
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
43
46
|
artifacts_local: The path inside the docker container that you want to persist artifacts from.
|
|
44
47
|
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
45
48
|
Only supported for jobs with a docker image compute environment.
|
|
@@ -53,7 +56,47 @@ class _Run:
|
|
|
53
56
|
"""
|
|
54
57
|
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
55
58
|
# might need to switch to explicit cli definition
|
|
56
|
-
self.job.__func__.__doc__ =
|
|
59
|
+
self.job.__func__.__doc__ = docstr_job
|
|
60
|
+
|
|
61
|
+
# Need to set the docstring here for f-strings to work.
|
|
62
|
+
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
63
|
+
# and fire does not show values for literals, just that it is a literal.
|
|
64
|
+
docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
68
|
+
num_machines: The number of Machines to run on. Defaults to 2 Machines
|
|
69
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
|
|
70
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
71
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
72
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
73
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
74
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
75
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
76
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
77
|
+
cloud_account: The cloud account to run the job on.
|
|
78
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
79
|
+
If not provided, will fall back to the teamspace's default cloud account.
|
|
80
|
+
env: Environment variables to set inside the job.
|
|
81
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
82
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
83
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
84
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
85
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
86
|
+
artifacts_local: The path inside the docker container that you want to persist artifacts from.
|
|
87
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
88
|
+
Only supported for jobs with a docker image compute environment.
|
|
89
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
90
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
91
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
92
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
93
|
+
within it.
|
|
94
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
95
|
+
Only supported for jobs with a docker image compute environment.
|
|
96
|
+
"""
|
|
97
|
+
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
98
|
+
# might need to switch to explicit cli definition
|
|
99
|
+
self.mmt.__func__.__doc__ = docstr_mmt
|
|
57
100
|
|
|
58
101
|
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
59
102
|
# see https://github.com/google/python-fire/pull/513
|
|
@@ -61,21 +104,30 @@ class _Run:
|
|
|
61
104
|
def job(
|
|
62
105
|
self,
|
|
63
106
|
name: str,
|
|
64
|
-
machine: str,
|
|
107
|
+
machine: Optional[str] = None,
|
|
65
108
|
command: Optional[str] = None,
|
|
66
109
|
studio: Optional[str] = None,
|
|
67
110
|
image: Optional[str] = None,
|
|
68
111
|
teamspace: Optional[str] = None,
|
|
69
112
|
org: Optional[str] = None,
|
|
70
113
|
user: Optional[str] = None,
|
|
71
|
-
|
|
114
|
+
cloud_account: Optional[str] = None,
|
|
72
115
|
env: Optional[Dict[str, str]] = None,
|
|
73
116
|
interruptible: bool = False,
|
|
74
117
|
image_credentials: Optional[str] = None,
|
|
75
|
-
|
|
118
|
+
cloud_account_auth: bool = False,
|
|
76
119
|
artifacts_local: Optional[str] = None,
|
|
77
120
|
artifacts_remote: Optional[str] = None,
|
|
78
121
|
) -> None:
|
|
122
|
+
if machine is None:
|
|
123
|
+
# TODO: infer from studio
|
|
124
|
+
machine = "CPU"
|
|
125
|
+
machine_enum = Machine(machine.upper())
|
|
126
|
+
|
|
127
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
128
|
+
|
|
129
|
+
if cloud_account is None:
|
|
130
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
79
131
|
machine_enum = Machine(machine.upper())
|
|
80
132
|
Job.run(
|
|
81
133
|
name=name,
|
|
@@ -83,14 +135,72 @@ class _Run:
|
|
|
83
135
|
command=command,
|
|
84
136
|
studio=studio,
|
|
85
137
|
image=image,
|
|
86
|
-
teamspace=
|
|
138
|
+
teamspace=resolved_teamspace,
|
|
139
|
+
org=org,
|
|
140
|
+
user=user,
|
|
141
|
+
cloud_account=cloud_account,
|
|
142
|
+
env=env,
|
|
143
|
+
interruptible=interruptible,
|
|
144
|
+
image_credentials=image_credentials,
|
|
145
|
+
cloud_account_auth=cloud_account_auth,
|
|
146
|
+
artifacts_local=artifacts_local,
|
|
147
|
+
artifacts_remote=artifacts_remote,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
151
|
+
# see https://github.com/google/python-fire/pull/513
|
|
152
|
+
# might need to move to different cli library
|
|
153
|
+
def mmt(
|
|
154
|
+
self,
|
|
155
|
+
name: Optional[str] = None,
|
|
156
|
+
num_machines: int = 2,
|
|
157
|
+
machine: Optional[str] = None,
|
|
158
|
+
command: Optional[str] = None,
|
|
159
|
+
image: Optional[str] = None,
|
|
160
|
+
teamspace: Optional[str] = None,
|
|
161
|
+
org: Optional[str] = None,
|
|
162
|
+
user: Optional[str] = None,
|
|
163
|
+
cloud_account: Optional[str] = None,
|
|
164
|
+
env: Optional[Dict[str, str]] = None,
|
|
165
|
+
interruptible: bool = False,
|
|
166
|
+
image_credentials: Optional[str] = None,
|
|
167
|
+
cloud_account_auth: bool = False,
|
|
168
|
+
artifacts_local: Optional[str] = None,
|
|
169
|
+
artifacts_remote: Optional[str] = None,
|
|
170
|
+
) -> None:
|
|
171
|
+
if name is None:
|
|
172
|
+
from datetime import datetime
|
|
173
|
+
|
|
174
|
+
timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
|
175
|
+
name = f"mmt-{timestr}"
|
|
176
|
+
|
|
177
|
+
if machine is None:
|
|
178
|
+
# TODO: infer from studio
|
|
179
|
+
machine = "CPU"
|
|
180
|
+
machine_enum = Machine(machine.upper())
|
|
181
|
+
|
|
182
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
183
|
+
if cloud_account is None:
|
|
184
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
185
|
+
|
|
186
|
+
if image is None:
|
|
187
|
+
raise RuntimeError("Image needs to be specified to run a multi-machine job")
|
|
188
|
+
|
|
189
|
+
MMT.run(
|
|
190
|
+
name=name,
|
|
191
|
+
num_machines=num_machines,
|
|
192
|
+
machine=machine_enum,
|
|
193
|
+
command=command,
|
|
194
|
+
studio=None,
|
|
195
|
+
image=image,
|
|
196
|
+
teamspace=resolved_teamspace,
|
|
87
197
|
org=org,
|
|
88
198
|
user=user,
|
|
89
|
-
|
|
199
|
+
cloud_account=cloud_account,
|
|
90
200
|
env=env,
|
|
91
201
|
interruptible=interruptible,
|
|
92
202
|
image_credentials=image_credentials,
|
|
93
|
-
|
|
203
|
+
cloud_account_auth=cloud_account_auth,
|
|
94
204
|
artifacts_local=artifacts_local,
|
|
95
205
|
artifacts_remote=artifacts_remote,
|
|
96
206
|
)
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import warnings
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
9
|
+
from rich.prompt import Confirm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class _LitServe:
|
|
13
|
+
"""Serve a LitServe model.
|
|
14
|
+
|
|
15
|
+
Example:
|
|
16
|
+
lightning serve api server.py # serve locally
|
|
17
|
+
lightning serve api server.py --cloud # deploy to the cloud
|
|
18
|
+
|
|
19
|
+
You can deploy the API to the cloud by running `lightning serve api server.py --cloud`.
|
|
20
|
+
This will generate a Dockerfile, build the image, and push it to the image registry.
|
|
21
|
+
Deploying to the cloud requires pre-login to the docker registry.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def api(
|
|
25
|
+
self,
|
|
26
|
+
script_path: Union[str, Path],
|
|
27
|
+
easy: bool = False,
|
|
28
|
+
cloud: bool = False,
|
|
29
|
+
repository: Optional[str] = None,
|
|
30
|
+
non_interactive: bool = False,
|
|
31
|
+
) -> None:
|
|
32
|
+
"""Deploy a LitServe model script.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
script_path: Path to the script to serve
|
|
36
|
+
easy: If True, generates a client for the model
|
|
37
|
+
cloud: If True, deploy the model to the Lightning Studio
|
|
38
|
+
repository: Optional Docker repository name (e.g., 'username/model-name')
|
|
39
|
+
non_interactive: If True, do not prompt for confirmation
|
|
40
|
+
Raises:
|
|
41
|
+
FileNotFoundError: If script_path doesn't exist
|
|
42
|
+
ImportError: If litserve is not installed
|
|
43
|
+
subprocess.CalledProcessError: If the script fails to run
|
|
44
|
+
IOError: If client.py generation fails
|
|
45
|
+
"""
|
|
46
|
+
console = Console()
|
|
47
|
+
script_path = Path(script_path)
|
|
48
|
+
if not script_path.exists():
|
|
49
|
+
raise FileNotFoundError(f"Script not found: {script_path}")
|
|
50
|
+
if not script_path.is_file():
|
|
51
|
+
raise ValueError(f"Path is not a file: {script_path}")
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
from litserve.python_client import client_template
|
|
55
|
+
except ImportError:
|
|
56
|
+
raise ImportError(
|
|
57
|
+
"litserve is not installed. Please install it with `pip install lightning_sdk[serve]`"
|
|
58
|
+
) from None
|
|
59
|
+
|
|
60
|
+
if easy:
|
|
61
|
+
client_path = Path("client.py")
|
|
62
|
+
if client_path.exists():
|
|
63
|
+
console.print("Skipping client generation: client.py already exists", style="blue")
|
|
64
|
+
else:
|
|
65
|
+
try:
|
|
66
|
+
client_path.write_text(client_template)
|
|
67
|
+
console.print("✅ Client generated at client.py", style="bold green")
|
|
68
|
+
except OSError as e:
|
|
69
|
+
raise OSError(f"Failed to generate client.py: {e!s}") from None
|
|
70
|
+
|
|
71
|
+
if cloud:
|
|
72
|
+
tag = repository if repository else "litserve-model"
|
|
73
|
+
return self._handle_cloud(script_path, console, tag=tag, non_interactive=non_interactive)
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
subprocess.run(
|
|
77
|
+
["python", str(script_path)],
|
|
78
|
+
check=True,
|
|
79
|
+
text=True,
|
|
80
|
+
)
|
|
81
|
+
except subprocess.CalledProcessError as e:
|
|
82
|
+
error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
|
|
83
|
+
raise RuntimeError(error_msg) from None
|
|
84
|
+
|
|
85
|
+
def _handle_cloud(
|
|
86
|
+
self,
|
|
87
|
+
script_path: Union[str, Path],
|
|
88
|
+
console: Console,
|
|
89
|
+
tag: str = "litserve-model",
|
|
90
|
+
non_interactive: bool = False,
|
|
91
|
+
) -> None:
|
|
92
|
+
try:
|
|
93
|
+
import docker
|
|
94
|
+
except ImportError:
|
|
95
|
+
raise ImportError("docker-py is not installed. Please install it with `pip install docker`") from None
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
client = docker.from_env()
|
|
99
|
+
client.ping()
|
|
100
|
+
except docker.errors.DockerException as e:
|
|
101
|
+
raise RuntimeError(f"Failed to connect to Docker daemon: {e!s}. Is Docker running?") from None
|
|
102
|
+
|
|
103
|
+
dockerizer = _Docker()
|
|
104
|
+
path = dockerizer.api(script_path, port=8000, gpu=False, tag=tag)
|
|
105
|
+
|
|
106
|
+
console.clear()
|
|
107
|
+
if non_interactive:
|
|
108
|
+
console.print("[italic]non-interactive[/italic] mode enabled, skipping confirmation prompts", style="blue")
|
|
109
|
+
|
|
110
|
+
console.print(f"\nPlease review the Dockerfile at [u]{path}[/u] and make sure it is correct.", style="bold")
|
|
111
|
+
correct_dockerfile = True if non_interactive else Confirm.ask("Is the Dockerfile correct?", default=True)
|
|
112
|
+
if not correct_dockerfile:
|
|
113
|
+
console.print("Please fix the Dockerfile and try again.", style="red")
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
with Progress(
|
|
117
|
+
SpinnerColumn(),
|
|
118
|
+
TextColumn("[progress.description]{task.description}"),
|
|
119
|
+
TimeElapsedColumn(),
|
|
120
|
+
console=console,
|
|
121
|
+
transient=False,
|
|
122
|
+
) as progress:
|
|
123
|
+
build_task = progress.add_task("Building Docker image", total=None)
|
|
124
|
+
build_status = client.api.build(
|
|
125
|
+
path=os.path.dirname(path), dockerfile=path, tag=tag, decode=True, quiet=False
|
|
126
|
+
)
|
|
127
|
+
for line in build_status:
|
|
128
|
+
if "error" in line:
|
|
129
|
+
progress.stop()
|
|
130
|
+
console.print(f"\n[red]{line}[/red]")
|
|
131
|
+
return
|
|
132
|
+
if "stream" in line and line["stream"].strip():
|
|
133
|
+
console.print(line["stream"].strip(), style="bright_black")
|
|
134
|
+
progress.update(build_task, description="Building Docker image")
|
|
135
|
+
|
|
136
|
+
progress.update(build_task, description="[green]Build completed![/green]")
|
|
137
|
+
|
|
138
|
+
push_task = progress.add_task("Pushing to registry", total=None)
|
|
139
|
+
console.print("\nPushing image...", style="bold blue")
|
|
140
|
+
push_status = client.api.push(tag, stream=True, decode=True)
|
|
141
|
+
for line in push_status:
|
|
142
|
+
if "error" in line:
|
|
143
|
+
progress.stop()
|
|
144
|
+
console.print(f"\n[red]{line}[/red]")
|
|
145
|
+
return
|
|
146
|
+
if "status" in line:
|
|
147
|
+
console.print(line["status"], style="bright_black")
|
|
148
|
+
progress.update(push_task, description="Pushing to registry")
|
|
149
|
+
|
|
150
|
+
progress.update(push_task, description="[green]Push completed![/green]")
|
|
151
|
+
|
|
152
|
+
console.print(f"\n✅ Image pushed to {tag}", style="bold green")
|
|
153
|
+
console.print(
|
|
154
|
+
"Soon you will be able to deploy this model to the Lightning Studio!",
|
|
155
|
+
)
|
|
156
|
+
# TODO: Deploy to the cloud
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class _Docker:
|
|
160
|
+
"""Generate a Dockerfile for a LitServe model."""
|
|
161
|
+
|
|
162
|
+
def api(self, server_filename: str, port: int = 8000, gpu: bool = False, tag: str = "litserve-model") -> str:
|
|
163
|
+
"""Generate a Dockerfile for the given server code.
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
server_filename: The path to the server file. Example server.py or app.py.
|
|
167
|
+
port: The port to expose in the Docker container.
|
|
168
|
+
gpu: Whether to use a GPU-enabled Docker image.
|
|
169
|
+
tag: Docker image tag to use in examples.
|
|
170
|
+
"""
|
|
171
|
+
import litserve as ls
|
|
172
|
+
from litserve import docker_builder
|
|
173
|
+
|
|
174
|
+
console = Console()
|
|
175
|
+
requirements = ""
|
|
176
|
+
if os.path.exists("requirements.txt"):
|
|
177
|
+
requirements = "-r requirements.txt"
|
|
178
|
+
else:
|
|
179
|
+
warnings.warn(
|
|
180
|
+
f"requirements.txt not found at {os.getcwd()}. "
|
|
181
|
+
f"Make sure to install the required packages in the Dockerfile.",
|
|
182
|
+
UserWarning,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
current_dir = Path.cwd()
|
|
186
|
+
if not (current_dir / server_filename).is_file():
|
|
187
|
+
raise FileNotFoundError(f"Server file `{server_filename}` must be in the current directory: {os.getcwd()}")
|
|
188
|
+
|
|
189
|
+
version = ls.__version__
|
|
190
|
+
if gpu:
|
|
191
|
+
run_cmd = f"docker run --gpus all -p {port}:{port} {tag}:latest"
|
|
192
|
+
docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
|
|
193
|
+
else:
|
|
194
|
+
run_cmd = f"docker run -p {port}:{port} {tag}:latest"
|
|
195
|
+
docker_template = docker_builder.DOCKERFILE_TEMPLATE
|
|
196
|
+
dockerfile_content = docker_template.format(
|
|
197
|
+
server_filename=server_filename,
|
|
198
|
+
port=port,
|
|
199
|
+
version=version,
|
|
200
|
+
requirements=requirements,
|
|
201
|
+
)
|
|
202
|
+
with open("Dockerfile", "w") as f:
|
|
203
|
+
f.write(dockerfile_content)
|
|
204
|
+
|
|
205
|
+
success_msg = f"""[bold]Dockerfile created successfully[/bold]
|
|
206
|
+
Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
|
|
207
|
+
|
|
208
|
+
[bold]Build the container with:[/bold]
|
|
209
|
+
> [underline]docker build -t {tag} .[/underline]
|
|
210
|
+
|
|
211
|
+
[bold]To run the Docker container on the machine:[/bold]
|
|
212
|
+
> [underline]{run_cmd}[/underline]
|
|
213
|
+
|
|
214
|
+
[bold]To push the container to a registry:[/bold]
|
|
215
|
+
> [underline]docker push {tag}[/underline]
|
|
216
|
+
"""
|
|
217
|
+
console.print(success_msg)
|
|
218
|
+
return os.path.abspath("Dockerfile")
|