lightning-sdk 0.1.42__py3-none-any.whl → 0.1.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/api/job_api.py +35 -0
- lightning_sdk/api/utils.py +8 -0
- lightning_sdk/cli/run.py +113 -4
- lightning_sdk/cli/serve.py +102 -14
- lightning_sdk/job/base.py +10 -0
- lightning_sdk/job/job.py +28 -4
- lightning_sdk/job/v1.py +5 -0
- lightning_sdk/job/v2.py +18 -0
- lightning_sdk/job/work.py +10 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_membership.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_multi_machine_job_state.py +1 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_project_membership.py +27 -1
- lightning_sdk/mmt/__init__.py +2 -1
- lightning_sdk/mmt/base.py +117 -15
- lightning_sdk/mmt/mmt.py +114 -22
- lightning_sdk/mmt/v1.py +56 -0
- lightning_sdk/mmt/v2.py +57 -0
- lightning_sdk/plugin.py +28 -23
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/METADATA +2 -1
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/RECORD +25 -26
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/entry_points.txt +0 -1
- lightning_sdk/cli/mmt.py +0 -138
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.42.dist-info → lightning_sdk-0.1.44.dist-info}/top_level.txt +0 -0
lightning_sdk/__init__.py
CHANGED
lightning_sdk/api/job_api.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import time
|
|
2
2
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
3
|
+
from urllib.request import urlopen
|
|
3
4
|
|
|
4
5
|
from lightning_sdk.api.utils import (
|
|
5
6
|
_COMPUTE_NAME_TO_MACHINE,
|
|
6
7
|
_MACHINE_TO_COMPUTE_NAME,
|
|
7
8
|
_create_app,
|
|
9
|
+
remove_datetime_prefix,
|
|
8
10
|
)
|
|
9
11
|
from lightning_sdk.api.utils import (
|
|
10
12
|
_get_cloud_url as _cloud_url,
|
|
@@ -16,7 +18,10 @@ from lightning_sdk.lightning_cloud.openapi import (
|
|
|
16
18
|
Externalv1Lightningwork,
|
|
17
19
|
JobsIdBody1,
|
|
18
20
|
ProjectIdJobsBody,
|
|
21
|
+
V1CloudSpace,
|
|
19
22
|
V1ComputeConfig,
|
|
23
|
+
V1DownloadJobLogsResponse,
|
|
24
|
+
V1DownloadLightningappInstanceLogsResponse,
|
|
20
25
|
V1EnvVar,
|
|
21
26
|
V1Job,
|
|
22
27
|
V1JobSpec,
|
|
@@ -102,6 +107,12 @@ class JobApiV1:
|
|
|
102
107
|
compute: str = compute_config.instance_type
|
|
103
108
|
return _COMPUTE_NAME_TO_MACHINE[compute]
|
|
104
109
|
|
|
110
|
+
def get_studio_name(self, job: Externalv1LightningappInstance) -> str:
|
|
111
|
+
cs: V1CloudSpace = self._client.cloud_space_service_get_cloud_space(
|
|
112
|
+
project_id=job.project_id, id=job.spec.cloud_space_id
|
|
113
|
+
)
|
|
114
|
+
return cs.name
|
|
115
|
+
|
|
105
116
|
def submit_job(
|
|
106
117
|
self,
|
|
107
118
|
name: str,
|
|
@@ -150,6 +161,16 @@ class JobApiV1:
|
|
|
150
161
|
|
|
151
162
|
return Status.Pending
|
|
152
163
|
|
|
164
|
+
def get_logs_finished(self, job_id: str, work_id: str, teamspace_id: str) -> str:
|
|
165
|
+
resp: (
|
|
166
|
+
V1DownloadLightningappInstanceLogsResponse
|
|
167
|
+
) = self._client.lightningapp_instance_service_download_lightningapp_instance_logs(
|
|
168
|
+
project_id=teamspace_id, id=job_id, work_id=work_id
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
data = urlopen(resp.url).read().decode("utf-8")
|
|
172
|
+
return remove_datetime_prefix(str(data))
|
|
173
|
+
|
|
153
174
|
|
|
154
175
|
class JobApiV2:
|
|
155
176
|
v2_job_state_pending = "pending"
|
|
@@ -247,6 +268,20 @@ class JobApiV2:
|
|
|
247
268
|
def delete_job(self, job_id: str, teamspace_id: str, cloudspace_id: Optional[str]) -> None:
|
|
248
269
|
self._client.jobs_service_delete_job(project_id=teamspace_id, id=job_id, cloudspace_id=cloudspace_id or "")
|
|
249
270
|
|
|
271
|
+
def get_logs_finished(self, job_id: str, teamspace_id: str) -> str:
|
|
272
|
+
resp: V1DownloadJobLogsResponse = self._client.jobs_service_download_job_logs(
|
|
273
|
+
project_id=teamspace_id, id=job_id
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
data = urlopen(resp.url).read().decode("utf-8")
|
|
277
|
+
return remove_datetime_prefix(str(data))
|
|
278
|
+
|
|
279
|
+
def get_studio_name(self, job: V1Job) -> str:
|
|
280
|
+
cs: V1CloudSpace = self._client.cloud_space_service_get_cloud_space(
|
|
281
|
+
project_id=job.project_id, id=job.spec.cloudspace_id
|
|
282
|
+
)
|
|
283
|
+
return cs.name
|
|
284
|
+
|
|
250
285
|
def _job_state_to_external(self, state: str) -> "Status":
|
|
251
286
|
from lightning_sdk.status import Status
|
|
252
287
|
|
lightning_sdk/api/utils.py
CHANGED
|
@@ -2,6 +2,7 @@ import concurrent.futures
|
|
|
2
2
|
import errno
|
|
3
3
|
import math
|
|
4
4
|
import os
|
|
5
|
+
import re
|
|
5
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
6
7
|
from functools import partial
|
|
7
8
|
from pathlib import Path
|
|
@@ -599,3 +600,10 @@ def _create_app(
|
|
|
599
600
|
print(f"Create App: {resp.id=} {teamspace_id=} {studio_id=} {cloud_account=}")
|
|
600
601
|
|
|
601
602
|
return resp
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def remove_datetime_prefix(text: str) -> str:
|
|
606
|
+
# Use a regular expression to match the datetime pattern at the start of each line
|
|
607
|
+
# lines looks something like
|
|
608
|
+
# '[2025-01-08T14:15:03.797142418Z] ⚡ ~ echo Hello\n[2025-01-08T14:15:03.803077717Z] Hello\n'
|
|
609
|
+
return re.sub(r"^\[.*?\] ", "", text, flags=re.MULTILINE)
|
lightning_sdk/cli/run.py
CHANGED
|
@@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Dict, Optional
|
|
|
2
2
|
|
|
3
3
|
from lightning_sdk.job import Job
|
|
4
4
|
from lightning_sdk.machine import Machine
|
|
5
|
+
from lightning_sdk.mmt import MMT
|
|
6
|
+
from lightning_sdk.teamspace import Teamspace
|
|
5
7
|
|
|
6
8
|
if TYPE_CHECKING:
|
|
7
9
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
@@ -20,7 +22,7 @@ class _Run:
|
|
|
20
22
|
# Need to set the docstring here for f-strings to work.
|
|
21
23
|
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
22
24
|
# and fire does not show values for literals, just that it is a literal.
|
|
23
|
-
|
|
25
|
+
docstr_job = f"""Run async workloads using a docker image or a compute environment from your studio.
|
|
24
26
|
|
|
25
27
|
Args:
|
|
26
28
|
name: The name of the job. Needs to be unique within the teamspace.
|
|
@@ -54,7 +56,47 @@ class _Run:
|
|
|
54
56
|
"""
|
|
55
57
|
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
56
58
|
# might need to switch to explicit cli definition
|
|
57
|
-
self.job.__func__.__doc__ =
|
|
59
|
+
self.job.__func__.__doc__ = docstr_job
|
|
60
|
+
|
|
61
|
+
# Need to set the docstring here for f-strings to work.
|
|
62
|
+
# Sadly this is the only way to really show options as f-strings are not allowed as docstrings directly
|
|
63
|
+
# and fire does not show values for literals, just that it is a literal.
|
|
64
|
+
docstr_mmt = f"""Run async workloads on multiple machines using a docker image.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
name: The name of the job. Needs to be unique within the teamspace.
|
|
68
|
+
num_machines: The number of Machines to run on. Defaults to 2 Machines
|
|
69
|
+
machine: The machine type to run the job on. One of {", ".join(_MACHINE_VALUES)}. Defaults to CPU
|
|
70
|
+
command: The command to run inside your job. Required if using a studio. Optional if using an image.
|
|
71
|
+
If not provided for images, will run the container entrypoint and default command.
|
|
72
|
+
studio: The studio env to run the job with. Mutually exclusive with image.
|
|
73
|
+
image: The docker image to run the job with. Mutually exclusive with studio.
|
|
74
|
+
teamspace: The teamspace the job should be associated with. Defaults to the current teamspace.
|
|
75
|
+
org: The organization owning the teamspace (if any). Defaults to the current organization.
|
|
76
|
+
user: The user owning the teamspace (if any). Defaults to the current user.
|
|
77
|
+
cloud_account: The cloud account to run the job on.
|
|
78
|
+
Defaults to the studio cloud account if running with studio compute env.
|
|
79
|
+
If not provided will fall back to the teamspaces default cloud account.
|
|
80
|
+
env: Environment variables to set inside the job.
|
|
81
|
+
interruptible: Whether the job should run on interruptible instances. They are cheaper but can be preempted.
|
|
82
|
+
image_credentials: The credentials used to pull the image. Required if the image is private.
|
|
83
|
+
This should be the name of the respective credentials secret created on the Lightning AI platform.
|
|
84
|
+
cloud_account_auth: Whether to authenticate with the cloud account to pull the image.
|
|
85
|
+
Required if the registry is part of a cloud provider (e.g. ECR).
|
|
86
|
+
artifacts_local: The path of inside the docker container, you want to persist images from.
|
|
87
|
+
CAUTION: When setting this to "/", it will effectively erase your container.
|
|
88
|
+
Only supported for jobs with a docker image compute environment.
|
|
89
|
+
artifacts_remote: The remote storage to persist your artifacts to.
|
|
90
|
+
Should be of format <CONNECTION_TYPE>:<CONNECTION_NAME>:<PATH_WITHIN_CONNECTION>.
|
|
91
|
+
PATH_WITHIN_CONNECTION hereby is a path relative to the connection's root.
|
|
92
|
+
E.g. efs:data:some-path would result in an EFS connection named `data` and to the path `some-path`
|
|
93
|
+
within it.
|
|
94
|
+
Note that the connection needs to be added to the teamspace already in order for it to be found.
|
|
95
|
+
Only supported for jobs with a docker image compute environment.
|
|
96
|
+
"""
|
|
97
|
+
# TODO: the docstrings from artifacts_local and artifacts_remote don't show up completely,
|
|
98
|
+
# might need to switch to explicit cli definition
|
|
99
|
+
self.mmt.__func__.__doc__ = docstr_mmt
|
|
58
100
|
|
|
59
101
|
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
60
102
|
# see https://github.com/google/python-fire/pull/513
|
|
@@ -62,7 +104,7 @@ class _Run:
|
|
|
62
104
|
def job(
|
|
63
105
|
self,
|
|
64
106
|
name: str,
|
|
65
|
-
machine: str,
|
|
107
|
+
machine: Optional[str] = None,
|
|
66
108
|
command: Optional[str] = None,
|
|
67
109
|
studio: Optional[str] = None,
|
|
68
110
|
image: Optional[str] = None,
|
|
@@ -77,6 +119,15 @@ class _Run:
|
|
|
77
119
|
artifacts_local: Optional[str] = None,
|
|
78
120
|
artifacts_remote: Optional[str] = None,
|
|
79
121
|
) -> None:
|
|
122
|
+
if machine is None:
|
|
123
|
+
# TODO: infer from studio
|
|
124
|
+
machine = "CPU"
|
|
125
|
+
machine_enum = Machine(machine.upper())
|
|
126
|
+
|
|
127
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
128
|
+
|
|
129
|
+
if cloud_account is None:
|
|
130
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
80
131
|
machine_enum = Machine(machine.upper())
|
|
81
132
|
Job.run(
|
|
82
133
|
name=name,
|
|
@@ -84,7 +135,65 @@ class _Run:
|
|
|
84
135
|
command=command,
|
|
85
136
|
studio=studio,
|
|
86
137
|
image=image,
|
|
87
|
-
teamspace=
|
|
138
|
+
teamspace=resolved_teamspace,
|
|
139
|
+
org=org,
|
|
140
|
+
user=user,
|
|
141
|
+
cloud_account=cloud_account,
|
|
142
|
+
env=env,
|
|
143
|
+
interruptible=interruptible,
|
|
144
|
+
image_credentials=image_credentials,
|
|
145
|
+
cloud_account_auth=cloud_account_auth,
|
|
146
|
+
artifacts_local=artifacts_local,
|
|
147
|
+
artifacts_remote=artifacts_remote,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# TODO: sadly, fire displays both Optional[type] and Union[type, None] as Optional[Optional]
|
|
151
|
+
# see https://github.com/google/python-fire/pull/513
|
|
152
|
+
# might need to move to different cli library
|
|
153
|
+
def mmt(
|
|
154
|
+
self,
|
|
155
|
+
name: Optional[str] = None,
|
|
156
|
+
num_machines: int = 2,
|
|
157
|
+
machine: Optional[str] = None,
|
|
158
|
+
command: Optional[str] = None,
|
|
159
|
+
image: Optional[str] = None,
|
|
160
|
+
teamspace: Optional[str] = None,
|
|
161
|
+
org: Optional[str] = None,
|
|
162
|
+
user: Optional[str] = None,
|
|
163
|
+
cloud_account: Optional[str] = None,
|
|
164
|
+
env: Optional[Dict[str, str]] = None,
|
|
165
|
+
interruptible: bool = False,
|
|
166
|
+
image_credentials: Optional[str] = None,
|
|
167
|
+
cloud_account_auth: bool = False,
|
|
168
|
+
artifacts_local: Optional[str] = None,
|
|
169
|
+
artifacts_remote: Optional[str] = None,
|
|
170
|
+
) -> None:
|
|
171
|
+
if name is None:
|
|
172
|
+
from datetime import datetime
|
|
173
|
+
|
|
174
|
+
timestr = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
|
175
|
+
name = f"mmt-{timestr}"
|
|
176
|
+
|
|
177
|
+
if machine is None:
|
|
178
|
+
# TODO: infer from studio
|
|
179
|
+
machine = "CPU"
|
|
180
|
+
machine_enum = Machine(machine.upper())
|
|
181
|
+
|
|
182
|
+
resolved_teamspace = Teamspace(name=teamspace, org=org, user=user)
|
|
183
|
+
if cloud_account is None:
|
|
184
|
+
cloud_account = resolved_teamspace.default_cloud_account
|
|
185
|
+
|
|
186
|
+
if image is None:
|
|
187
|
+
raise RuntimeError("Image needs to be specified to run a multi-machine job")
|
|
188
|
+
|
|
189
|
+
MMT.run(
|
|
190
|
+
name=name,
|
|
191
|
+
num_machines=num_machines,
|
|
192
|
+
machine=machine_enum,
|
|
193
|
+
command=command,
|
|
194
|
+
studio=None,
|
|
195
|
+
image=image,
|
|
196
|
+
teamspace=resolved_teamspace,
|
|
88
197
|
org=org,
|
|
89
198
|
user=user,
|
|
90
199
|
cloud_account=cloud_account,
|
lightning_sdk/cli/serve.py
CHANGED
|
@@ -2,29 +2,41 @@ import os
|
|
|
2
2
|
import subprocess
|
|
3
3
|
import warnings
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Union
|
|
5
|
+
from typing import Optional, Union
|
|
6
6
|
|
|
7
7
|
from rich.console import Console
|
|
8
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
9
|
+
from rich.prompt import Confirm
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class _LitServe:
|
|
11
13
|
"""Serve a LitServe model.
|
|
12
14
|
|
|
13
15
|
Example:
|
|
14
|
-
lightning serve api server.py
|
|
16
|
+
lightning serve api server.py # serve locally
|
|
17
|
+
lightning serve api server.py --cloud # deploy to the cloud
|
|
18
|
+
|
|
19
|
+
You can deploy the API to the cloud by running `lightning serve api server.py --cloud`.
|
|
20
|
+
This will generate a Dockerfile, build the image, and push it to the image registry.
|
|
21
|
+
Deploying to the cloud requires pre-login to the docker registry.
|
|
15
22
|
"""
|
|
16
23
|
|
|
17
24
|
def api(
|
|
18
25
|
self,
|
|
19
26
|
script_path: Union[str, Path],
|
|
20
27
|
easy: bool = False,
|
|
28
|
+
cloud: bool = False,
|
|
29
|
+
repository: Optional[str] = None,
|
|
30
|
+
non_interactive: bool = False,
|
|
21
31
|
) -> None:
|
|
22
32
|
"""Deploy a LitServe model script.
|
|
23
33
|
|
|
24
34
|
Args:
|
|
25
35
|
script_path: Path to the script to serve
|
|
26
36
|
easy: If True, generates a client for the model
|
|
27
|
-
|
|
37
|
+
cloud: If True, deploy the model to the Lightning Studio
|
|
38
|
+
repository: Optional Docker repository name (e.g., 'username/model-name')
|
|
39
|
+
non_interactive: If True, do not prompt for confirmation
|
|
28
40
|
Raises:
|
|
29
41
|
FileNotFoundError: If script_path doesn't exist
|
|
30
42
|
ImportError: If litserve is not installed
|
|
@@ -56,6 +68,10 @@ class _LitServe:
|
|
|
56
68
|
except OSError as e:
|
|
57
69
|
raise OSError(f"Failed to generate client.py: {e!s}") from None
|
|
58
70
|
|
|
71
|
+
if cloud:
|
|
72
|
+
tag = repository if repository else "litserve-model"
|
|
73
|
+
return self._handle_cloud(script_path, console, tag=tag, non_interactive=non_interactive)
|
|
74
|
+
|
|
59
75
|
try:
|
|
60
76
|
subprocess.run(
|
|
61
77
|
["python", str(script_path)],
|
|
@@ -66,20 +82,91 @@ class _LitServe:
|
|
|
66
82
|
error_msg = f"Script execution failed with exit code {e.returncode}\nstdout: {e.stdout}\nstderr: {e.stderr}"
|
|
67
83
|
raise RuntimeError(error_msg) from None
|
|
68
84
|
|
|
85
|
+
def _handle_cloud(
|
|
86
|
+
self,
|
|
87
|
+
script_path: Union[str, Path],
|
|
88
|
+
console: Console,
|
|
89
|
+
tag: str = "litserve-model",
|
|
90
|
+
non_interactive: bool = False,
|
|
91
|
+
) -> None:
|
|
92
|
+
try:
|
|
93
|
+
import docker
|
|
94
|
+
except ImportError:
|
|
95
|
+
raise ImportError("docker-py is not installed. Please install it with `pip install docker`") from None
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
client = docker.from_env()
|
|
99
|
+
client.ping()
|
|
100
|
+
except docker.errors.DockerException as e:
|
|
101
|
+
raise RuntimeError(f"Failed to connect to Docker daemon: {e!s}. Is Docker running?") from None
|
|
102
|
+
|
|
103
|
+
dockerizer = _Docker()
|
|
104
|
+
path = dockerizer.api(script_path, port=8000, gpu=False, tag=tag)
|
|
105
|
+
|
|
106
|
+
console.clear()
|
|
107
|
+
if non_interactive:
|
|
108
|
+
console.print("[italic]non-interactive[/italic] mode enabled, skipping confirmation prompts", style="blue")
|
|
109
|
+
|
|
110
|
+
console.print(f"\nPlease review the Dockerfile at [u]{path}[/u] and make sure it is correct.", style="bold")
|
|
111
|
+
correct_dockerfile = True if non_interactive else Confirm.ask("Is the Dockerfile correct?", default=True)
|
|
112
|
+
if not correct_dockerfile:
|
|
113
|
+
console.print("Please fix the Dockerfile and try again.", style="red")
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
with Progress(
|
|
117
|
+
SpinnerColumn(),
|
|
118
|
+
TextColumn("[progress.description]{task.description}"),
|
|
119
|
+
TimeElapsedColumn(),
|
|
120
|
+
console=console,
|
|
121
|
+
transient=False,
|
|
122
|
+
) as progress:
|
|
123
|
+
build_task = progress.add_task("Building Docker image", total=None)
|
|
124
|
+
build_status = client.api.build(
|
|
125
|
+
path=os.path.dirname(path), dockerfile=path, tag=tag, decode=True, quiet=False
|
|
126
|
+
)
|
|
127
|
+
for line in build_status:
|
|
128
|
+
if "error" in line:
|
|
129
|
+
progress.stop()
|
|
130
|
+
console.print(f"\n[red]{line}[/red]")
|
|
131
|
+
return
|
|
132
|
+
if "stream" in line and line["stream"].strip():
|
|
133
|
+
console.print(line["stream"].strip(), style="bright_black")
|
|
134
|
+
progress.update(build_task, description="Building Docker image")
|
|
135
|
+
|
|
136
|
+
progress.update(build_task, description="[green]Build completed![/green]")
|
|
137
|
+
|
|
138
|
+
push_task = progress.add_task("Pushing to registry", total=None)
|
|
139
|
+
console.print("\nPushing image...", style="bold blue")
|
|
140
|
+
push_status = client.api.push(tag, stream=True, decode=True)
|
|
141
|
+
for line in push_status:
|
|
142
|
+
if "error" in line:
|
|
143
|
+
progress.stop()
|
|
144
|
+
console.print(f"\n[red]{line}[/red]")
|
|
145
|
+
return
|
|
146
|
+
if "status" in line:
|
|
147
|
+
console.print(line["status"], style="bright_black")
|
|
148
|
+
progress.update(push_task, description="Pushing to registry")
|
|
149
|
+
|
|
150
|
+
progress.update(push_task, description="[green]Push completed![/green]")
|
|
151
|
+
|
|
152
|
+
console.print(f"\n✅ Image pushed to {tag}", style="bold green")
|
|
153
|
+
console.print(
|
|
154
|
+
"Soon you will be able to deploy this model to the Lightning Studio!",
|
|
155
|
+
)
|
|
156
|
+
# TODO: Deploy to the cloud
|
|
157
|
+
|
|
69
158
|
|
|
70
159
|
class _Docker:
|
|
71
160
|
"""Generate a Dockerfile for a LitServe model."""
|
|
72
161
|
|
|
73
|
-
def api(self, server_filename: str, port: int = 8000, gpu: bool = False) ->
|
|
162
|
+
def api(self, server_filename: str, port: int = 8000, gpu: bool = False, tag: str = "litserve-model") -> str:
|
|
74
163
|
"""Generate a Dockerfile for the given server code.
|
|
75
164
|
|
|
76
|
-
Example:
|
|
77
|
-
lightning litserve dockerize server.py --port 8000 --gpu
|
|
78
|
-
|
|
79
165
|
Args:
|
|
80
|
-
server_filename
|
|
81
|
-
port
|
|
82
|
-
gpu
|
|
166
|
+
server_filename: The path to the server file. Example sever.py or app.py.
|
|
167
|
+
port: The port to expose in the Docker container.
|
|
168
|
+
gpu: Whether to use a GPU-enabled Docker image.
|
|
169
|
+
tag: Docker image tag to use in examples.
|
|
83
170
|
"""
|
|
84
171
|
import litserve as ls
|
|
85
172
|
from litserve import docker_builder
|
|
@@ -101,10 +188,10 @@ class _Docker:
|
|
|
101
188
|
|
|
102
189
|
version = ls.__version__
|
|
103
190
|
if gpu:
|
|
104
|
-
run_cmd = f"docker run --gpus all -p {port}:{port}
|
|
191
|
+
run_cmd = f"docker run --gpus all -p {port}:{port} {tag}:latest"
|
|
105
192
|
docker_template = docker_builder.CUDA_DOCKER_TEMPLATE
|
|
106
193
|
else:
|
|
107
|
-
run_cmd = f"docker run -p {port}:{port}
|
|
194
|
+
run_cmd = f"docker run -p {port}:{port} {tag}:latest"
|
|
108
195
|
docker_template = docker_builder.DOCKERFILE_TEMPLATE
|
|
109
196
|
dockerfile_content = docker_template.format(
|
|
110
197
|
server_filename=server_filename,
|
|
@@ -119,12 +206,13 @@ class _Docker:
|
|
|
119
206
|
Update [underline]{os.path.abspath("Dockerfile")}[/underline] to add any additional dependencies or commands.
|
|
120
207
|
|
|
121
208
|
[bold]Build the container with:[/bold]
|
|
122
|
-
> [underline]docker build -t
|
|
209
|
+
> [underline]docker build -t {tag} .[/underline]
|
|
123
210
|
|
|
124
211
|
[bold]To run the Docker container on the machine:[/bold]
|
|
125
212
|
> [underline]{run_cmd}[/underline]
|
|
126
213
|
|
|
127
214
|
[bold]To push the container to a registry:[/bold]
|
|
128
|
-
> [underline]docker push
|
|
215
|
+
> [underline]docker push {tag}[/underline]
|
|
129
216
|
"""
|
|
130
217
|
console.print(success_msg)
|
|
218
|
+
return os.path.abspath("Dockerfile")
|
lightning_sdk/job/base.py
CHANGED
|
@@ -273,6 +273,16 @@ class _BaseJob(ABC):
|
|
|
273
273
|
"""The teamspace the job is part of."""
|
|
274
274
|
return self._teamspace
|
|
275
275
|
|
|
276
|
+
@property
|
|
277
|
+
@abstractmethod
|
|
278
|
+
def logs(self) -> str:
|
|
279
|
+
"""The logs of the job."""
|
|
280
|
+
|
|
281
|
+
@property
|
|
282
|
+
def link(self) -> str:
|
|
283
|
+
"""A link to view the current job in the UI."""
|
|
284
|
+
return f"https://lightning.ai/{self.teamspace.owner.name}/{self.teamspace.name}/studios/{self._job_api.get_studio_name(self._guaranteed_job)}/app?app_id=jobs&job_name={self.name}"
|
|
285
|
+
|
|
276
286
|
@property
|
|
277
287
|
def _guaranteed_job(self) -> Any:
|
|
278
288
|
"""Guarantees that the job was fetched at some point before returning it.
|
lightning_sdk/job/job.py
CHANGED
|
@@ -5,6 +5,9 @@ from lightning_sdk.api.user_api import UserApi
|
|
|
5
5
|
from lightning_sdk.job.base import _BaseJob
|
|
6
6
|
from lightning_sdk.job.v1 import _JobV1
|
|
7
7
|
from lightning_sdk.job.v2 import _JobV2
|
|
8
|
+
from lightning_sdk.utils.resolve import _setup_logger
|
|
9
|
+
|
|
10
|
+
_logger = _setup_logger(__name__)
|
|
8
11
|
|
|
9
12
|
if TYPE_CHECKING:
|
|
10
13
|
from lightning_sdk.machine import Machine
|
|
@@ -25,6 +28,10 @@ def _has_jobs_v2() -> bool:
|
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
class Job(_BaseJob):
|
|
31
|
+
"""Class to submit and manage single-machine jobs on the Lightning AI Platform."""
|
|
32
|
+
|
|
33
|
+
_force_v1: bool = False
|
|
34
|
+
|
|
28
35
|
def __init__(
|
|
29
36
|
self,
|
|
30
37
|
name: str,
|
|
@@ -34,7 +41,16 @@ class Job(_BaseJob):
|
|
|
34
41
|
*,
|
|
35
42
|
_fetch_job: bool = True,
|
|
36
43
|
) -> None:
|
|
37
|
-
|
|
44
|
+
"""Fetch already existing jobs.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
name: the name of the job
|
|
48
|
+
teamspace: the teamspace the job is part of
|
|
49
|
+
org: the name of the organization owning the :param`teamspace` in case it is owned by an org
|
|
50
|
+
user: the name of the user owning the :param`teamspace`
|
|
51
|
+
in case it is owned directly by a user instead of an org.
|
|
52
|
+
"""
|
|
53
|
+
internal_job_cls = _JobV2 if _has_jobs_v2() and not self._force_v1 else _JobV1
|
|
38
54
|
|
|
39
55
|
self._internal_job = internal_job_cls(
|
|
40
56
|
name=name,
|
|
@@ -116,6 +132,8 @@ class Job(_BaseJob):
|
|
|
116
132
|
)
|
|
117
133
|
# required for typing with "Job"
|
|
118
134
|
assert isinstance(ret_val, cls)
|
|
135
|
+
|
|
136
|
+
_logger.info(f"Job was successfully launched. View it at {ret_val.link}")
|
|
119
137
|
return ret_val
|
|
120
138
|
|
|
121
139
|
def _submit(
|
|
@@ -228,9 +246,11 @@ class Job(_BaseJob):
|
|
|
228
246
|
return self._internal_job._teamspace
|
|
229
247
|
|
|
230
248
|
@property
|
|
231
|
-
def
|
|
232
|
-
"""The
|
|
233
|
-
|
|
249
|
+
def logs(self) -> str:
|
|
250
|
+
"""The logs of the job."""
|
|
251
|
+
if self.status not in (Status.Failed, Status.Completed, Status.Stopped):
|
|
252
|
+
raise RuntimeError("Getting jobs logs while the job is pending or running is not supported yet!")
|
|
253
|
+
return self._internal_job.logs
|
|
234
254
|
|
|
235
255
|
def __getattr__(self, key: str) -> Any:
|
|
236
256
|
"""Forward the attribute lookup to the internal job implementation."""
|
|
@@ -238,3 +258,7 @@ class Job(_BaseJob):
|
|
|
238
258
|
return getattr(super(), key)
|
|
239
259
|
except AttributeError:
|
|
240
260
|
return getattr(self._internal_job, key)
|
|
261
|
+
|
|
262
|
+
@property
|
|
263
|
+
def link(self) -> str:
|
|
264
|
+
return self._internal_job.link
|
lightning_sdk/job/v1.py
CHANGED
|
@@ -210,6 +210,11 @@ class _JobV1(_BaseJob):
|
|
|
210
210
|
"""The path to the share of the job in the distributed teamspace filesystem."""
|
|
211
211
|
return f"/teamspace/jobs/{self.name}/share"
|
|
212
212
|
|
|
213
|
+
@property
|
|
214
|
+
def logs(self) -> str:
|
|
215
|
+
"""The logs of the job."""
|
|
216
|
+
return self.work.logs
|
|
217
|
+
|
|
213
218
|
# the following and functions are solely to make the Work class function
|
|
214
219
|
@property
|
|
215
220
|
def _id(self) -> str:
|
lightning_sdk/job/v2.py
CHANGED
|
@@ -167,6 +167,24 @@ class _JobV2(_BaseJob):
|
|
|
167
167
|
"""The path to the share of the job within the distributed teamspace filesystem."""
|
|
168
168
|
raise NotImplementedError("Not implemented yet")
|
|
169
169
|
|
|
170
|
+
@property
|
|
171
|
+
def logs(self) -> str:
|
|
172
|
+
from lightning_sdk.status import Status
|
|
173
|
+
|
|
174
|
+
if self.status not in (Status.Failed, Status.Completed, Status.Stopped):
|
|
175
|
+
raise RuntimeError("Getting jobs logs while the job is pending or running is not supported yet!")
|
|
176
|
+
|
|
177
|
+
return self._job_api.get_logs_finished(job_id=self._guaranteed_job.id, teamspace_id=self.teamspace.id)
|
|
178
|
+
|
|
179
|
+
@property
|
|
180
|
+
def link(self) -> str:
|
|
181
|
+
if self._guaranteed_job.spec.image:
|
|
182
|
+
return (
|
|
183
|
+
f"https://lightning.ai/{self.teamspace.owner.name}/{self.teamspace.name}/jobs/{self.name}?app_id=jobs"
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
return super().link
|
|
187
|
+
|
|
170
188
|
def _update_internal_job(self) -> None:
|
|
171
189
|
if getattr(self, "_job", None) is None:
|
|
172
190
|
self._job = self._job_api.get_job_by_name(name=self._name, teamspace_id=self._teamspace.id)
|
lightning_sdk/job/work.py
CHANGED
|
@@ -60,3 +60,13 @@ class Work:
|
|
|
60
60
|
@property
|
|
61
61
|
def status(self) -> "Status":
|
|
62
62
|
return self._job_api.get_status_from_work(self._latest_work)
|
|
63
|
+
|
|
64
|
+
@property
|
|
65
|
+
def logs(self) -> str:
|
|
66
|
+
"""The logs of the work."""
|
|
67
|
+
from lightning_sdk.status import Status
|
|
68
|
+
|
|
69
|
+
if self.status not in (Status.Failed, Status.Completed, Status.Stopped):
|
|
70
|
+
raise RuntimeError("Getting jobs logs while the job is pending or running is not supported yet!")
|
|
71
|
+
|
|
72
|
+
return self._job_api.get_logs_finished(job_id=self._job._id, work_id=self._id, teamspace_id=self._teamspace.id)
|
|
@@ -52,6 +52,7 @@ class V1Membership(object):
|
|
|
52
52
|
'job_count': 'str',
|
|
53
53
|
'membership_count': 'str',
|
|
54
54
|
'name': 'str',
|
|
55
|
+
'next_free_credits_grant_at': 'datetime',
|
|
55
56
|
'owner_id': 'str',
|
|
56
57
|
'owner_type': 'V1OwnerType',
|
|
57
58
|
'project_id': 'str',
|
|
@@ -73,6 +74,7 @@ class V1Membership(object):
|
|
|
73
74
|
'job_count': 'jobCount',
|
|
74
75
|
'membership_count': 'membershipCount',
|
|
75
76
|
'name': 'name',
|
|
77
|
+
'next_free_credits_grant_at': 'nextFreeCreditsGrantAt',
|
|
76
78
|
'owner_id': 'ownerId',
|
|
77
79
|
'owner_type': 'ownerType',
|
|
78
80
|
'project_id': 'projectId',
|
|
@@ -82,7 +84,7 @@ class V1Membership(object):
|
|
|
82
84
|
'user_id': 'userId'
|
|
83
85
|
}
|
|
84
86
|
|
|
85
|
-
def __init__(self, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, free_credits_enabled: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, membership_count: 'str' =None, name: 'str' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None): # noqa: E501
|
|
87
|
+
def __init__(self, balance: 'float' =None, created_at: 'datetime' =None, creator_id: 'str' =None, datastore_count: 'str' =None, description: 'str' =None, display_name: 'str' =None, free_credits_enabled: 'bool' =None, is_default: 'bool' =None, job_count: 'str' =None, membership_count: 'str' =None, name: 'str' =None, next_free_credits_grant_at: 'datetime' =None, owner_id: 'str' =None, owner_type: 'V1OwnerType' =None, project_id: 'str' =None, quotas: 'V1Quotas' =None, roles: 'list[V1Role]' =None, updated_at: 'datetime' =None, user_id: 'str' =None): # noqa: E501
|
|
86
88
|
"""V1Membership - a model defined in Swagger""" # noqa: E501
|
|
87
89
|
self._balance = None
|
|
88
90
|
self._created_at = None
|
|
@@ -95,6 +97,7 @@ class V1Membership(object):
|
|
|
95
97
|
self._job_count = None
|
|
96
98
|
self._membership_count = None
|
|
97
99
|
self._name = None
|
|
100
|
+
self._next_free_credits_grant_at = None
|
|
98
101
|
self._owner_id = None
|
|
99
102
|
self._owner_type = None
|
|
100
103
|
self._project_id = None
|
|
@@ -125,6 +128,8 @@ class V1Membership(object):
|
|
|
125
128
|
self.membership_count = membership_count
|
|
126
129
|
if name is not None:
|
|
127
130
|
self.name = name
|
|
131
|
+
if next_free_credits_grant_at is not None:
|
|
132
|
+
self.next_free_credits_grant_at = next_free_credits_grant_at
|
|
128
133
|
if owner_id is not None:
|
|
129
134
|
self.owner_id = owner_id
|
|
130
135
|
if owner_type is not None:
|
|
@@ -371,6 +376,27 @@ class V1Membership(object):
|
|
|
371
376
|
|
|
372
377
|
self._name = name
|
|
373
378
|
|
|
379
|
+
@property
|
|
380
|
+
def next_free_credits_grant_at(self) -> 'datetime':
|
|
381
|
+
"""Gets the next_free_credits_grant_at of this V1Membership. # noqa: E501
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
:return: The next_free_credits_grant_at of this V1Membership. # noqa: E501
|
|
385
|
+
:rtype: datetime
|
|
386
|
+
"""
|
|
387
|
+
return self._next_free_credits_grant_at
|
|
388
|
+
|
|
389
|
+
@next_free_credits_grant_at.setter
|
|
390
|
+
def next_free_credits_grant_at(self, next_free_credits_grant_at: 'datetime'):
|
|
391
|
+
"""Sets the next_free_credits_grant_at of this V1Membership.
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
:param next_free_credits_grant_at: The next_free_credits_grant_at of this V1Membership. # noqa: E501
|
|
395
|
+
:type: datetime
|
|
396
|
+
"""
|
|
397
|
+
|
|
398
|
+
self._next_free_credits_grant_at = next_free_credits_grant_at
|
|
399
|
+
|
|
374
400
|
@property
|
|
375
401
|
def owner_id(self) -> 'str':
|
|
376
402
|
"""Gets the owner_id of this V1Membership. # noqa: E501
|
|
@@ -38,6 +38,7 @@ class V1MultiMachineJobState(object):
|
|
|
38
38
|
allowed enum values
|
|
39
39
|
"""
|
|
40
40
|
UNSPECIFIED = "MultiMachineJob_STATE_UNSPECIFIED"
|
|
41
|
+
PENDING = "MultiMachineJob_STATE_PENDING"
|
|
41
42
|
RUNNING = "MultiMachineJob_STATE_RUNNING"
|
|
42
43
|
STOPPED = "MultiMachineJob_STATE_STOPPED"
|
|
43
44
|
DELETED = "MultiMachineJob_STATE_DELETED"
|