lightning-sdk 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightning_sdk/__init__.py +1 -1
- lightning_sdk/api/deployment_api.py +9 -1
- lightning_sdk/api/job_api.py +1 -1
- lightning_sdk/api/lit_container_api.py +29 -8
- lightning_sdk/cli/delete.py +27 -0
- lightning_sdk/cli/entrypoint.py +6 -0
- lightning_sdk/cli/generate.py +58 -0
- lightning_sdk/cli/list.py +48 -0
- lightning_sdk/cli/start.py +43 -0
- lightning_sdk/cli/stop.py +26 -0
- lightning_sdk/cli/switch.py +43 -0
- lightning_sdk/deployment/deployment.py +12 -3
- lightning_sdk/job/base.py +7 -2
- lightning_sdk/job/job.py +1 -12
- lightning_sdk/job/v1.py +1 -32
- lightning_sdk/job/v2.py +6 -1
- lightning_sdk/lightning_cloud/openapi/__init__.py +2 -0
- lightning_sdk/lightning_cloud/openapi/api/cluster_service_api.py +10 -2
- lightning_sdk/lightning_cloud/openapi/api/lit_registry_service_api.py +210 -0
- lightning_sdk/lightning_cloud/openapi/models/__init__.py +2 -0
- lightning_sdk/lightning_cloud/openapi/models/cluster_id_usagerestrictions_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/usagerestrictions_id_body.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_cloud_provider.py +3 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_cluster_accelerator.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_job.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_job_spec.py +27 -1
- lightning_sdk/lightning_cloud/openapi/models/v1_lambda_labs_direct_v1.py +55 -3
- lightning_sdk/lightning_cloud/openapi/models/v1_list_lit_registry_repository_image_artifact_versions_response.py +231 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_lit_registry_artifact.py +253 -0
- lightning_sdk/lightning_cloud/openapi/models/v1_user_features.py +27 -53
- lightning_sdk/mmt/mmt.py +7 -6
- lightning_sdk/plugin.py +5 -3
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/METADATA +1 -1
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/RECORD +38 -33
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/LICENSE +0 -0
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/WHEEL +0 -0
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/entry_points.txt +0 -0
- {lightning_sdk-0.1.53.dist-info → lightning_sdk-0.1.55.dist-info}/top_level.txt +0 -0
lightning_sdk/__init__.py
CHANGED
|
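(hunk for lightning_sdk/__init__.py not captured in this extract; the file list above records +1 -1)
lightning_sdk/api/deployment_api.py
CHANGED
|
@@ -213,6 +213,14 @@ class DeploymentApi: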
|
|
|
213
213
|
return None
|
|
214
214
|
raise ex
|
|
215
215
|
|
|
216
|
+
def get_deployment_by_id(self, deployment_id: str, teamspace_id: str) -> Optional[V1Deployment]:
|
|
217
|
+
try:
|
|
218
|
+
return self._client.jobs_service_get_deployment(project_id=teamspace_id, id=deployment_id)
|
|
219
|
+
except ApiException as ex:
|
|
220
|
+
if "Reason: Not Found" in str(ex):
|
|
221
|
+
return None
|
|
222
|
+
raise ex
|
|
223
|
+
|
|
216
224
|
def create_deployment(
|
|
217
225
|
self,
|
|
218
226
|
deployment: V1Deployment,
|
|
@@ -269,7 +277,7 @@ class DeploymentApi:
|
|
|
269
277
|
requires_release |= apply_change(deployment.spec, "entrypoint", entrypoint)
|
|
270
278
|
requires_release |= apply_change(deployment.spec, "command", command)
|
|
271
279
|
requires_release |= apply_change(deployment.spec, "env", to_env(env))
|
|
272
|
-
requires_release |= apply_change(deployment.spec, "
|
|
280
|
+
requires_release |= apply_change(deployment.spec, "readiness_probe", to_health_check(health_check))
|
|
273
281
|
requires_release |= apply_change(deployment.spec, "cluster_id", cloud_account)
|
|
274
282
|
requires_release |= apply_change(deployment.spec, "spot", spot)
|
|
275
283
|
|
lightning_sdk/api/job_api.py
CHANGED
|
@@ -275,7 +275,7 @@ class JobApiV2:
|
|
|
275
275
|
return
|
|
276
276
|
|
|
277
277
|
if current_state != Status.Stopping:
|
|
278
|
-
update_body = JobsIdBody1(
|
|
278
|
+
update_body = JobsIdBody1(state=self.v2_job_state_stop)
|
|
279
279
|
self._client.jobs_service_update_job(body=update_body, project_id=teamspace_id, id=job_id)
|
|
280
280
|
|
|
281
281
|
while True:
|
|
lightning_sdk/api/lit_container_api.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from typing import Generator, List
|
|
1
|
+
from typing import Any, Callable, Generator, List
|
|
2
2
|
|
|
3
3
|
import docker
|
|
4
|
+
import requests
|
|
4
5
|
|
|
5
6
|
from lightning_sdk.api.utils import _get_registry_url
|
|
6
7
|
from lightning_sdk.lightning_cloud.env import LIGHTNING_CLOUD_URL
|
|
@@ -11,7 +12,22 @@ from lightning_sdk.teamspace import Teamspace
|
|
|
11
12
|
|
|
12
13
|
class LCRAuthFailedError(Exception):
|
|
13
14
|
def __init__(self) -> None:
|
|
14
|
-
super().__init__(
|
|
15
|
+
super().__init__(
|
|
16
|
+
"Failed to authenticate with Lightning Container Registry. Please login manually "
|
|
17
|
+
"using the following command:\n "
|
|
18
|
+
"echo $LIGHTNING_API_KEY | docker login litcr.io --username=LIGHTNING_USERNAME --password-stdin"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def retry_on_lcr_auth_failure(func: Callable) -> Callable:
|
|
23
|
+
def wrapper(self: "LitContainerApi", *args: Any, **kwargs: Any) -> Callable:
|
|
24
|
+
try:
|
|
25
|
+
return func(self, *args, **kwargs)
|
|
26
|
+
except LCRAuthFailedError:
|
|
27
|
+
self.authenticate()
|
|
28
|
+
return func(self, *args, **kwargs)
|
|
29
|
+
|
|
30
|
+
return wrapper
|
|
15
31
|
|
|
16
32
|
|
|
17
33
|
class LitContainerApi:
|
|
@@ -38,35 +54,40 @@ class LitContainerApi:
|
|
|
38
54
|
def delete_container(self, project_id: str, container: str) -> V1DeleteLitRepositoryResponse:
|
|
39
55
|
try:
|
|
40
56
|
return self._client.lit_registry_service_delete_lit_repository(project_id, container)
|
|
41
|
-
except Exception as
|
|
42
|
-
raise ValueError(f"Could not delete container {container} from project {project_id}") from
|
|
57
|
+
except Exception as e:
|
|
58
|
+
raise ValueError(f"Could not delete container {container} from project {project_id}: {e!s}") from e
|
|
43
59
|
|
|
44
|
-
def upload_container(self, container: str, teamspace: Teamspace, tag: str) -> Generator[
|
|
60
|
+
def upload_container(self, container: str, teamspace: Teamspace, tag: str) -> Generator[dict, None, None]:
|
|
45
61
|
try:
|
|
46
62
|
self._docker_client.images.get(container)
|
|
47
63
|
except docker.errors.ImageNotFound:
|
|
48
64
|
raise ValueError(f"Container {container} does not exist") from None
|
|
49
65
|
|
|
50
66
|
registry_url = _get_registry_url()
|
|
51
|
-
|
|
67
|
+
container_basename = container.split("/")[-1]
|
|
68
|
+
repository = f"{registry_url}/lit-container/{teamspace.owner.name}/{teamspace.name}/{container_basename}"
|
|
52
69
|
tagged = self._docker_client.api.tag(container, repository, tag)
|
|
53
70
|
if not tagged:
|
|
54
71
|
raise ValueError(f"Could not tag container {container} with {repository}:{tag}")
|
|
55
72
|
lines = self._docker_client.api.push(repository, stream=True, decode=True)
|
|
56
73
|
for line in lines:
|
|
57
|
-
if "errorDetail" in line and "authorization failed" in line["error"]:
|
|
74
|
+
if "errorDetail" in line and ("authorization failed" in line["error"] or "unauth" in line["error"]):
|
|
58
75
|
raise LCRAuthFailedError()
|
|
59
76
|
yield line
|
|
60
77
|
yield {
|
|
61
78
|
"finish": True,
|
|
62
|
-
"url": f"{LIGHTNING_CLOUD_URL}/{teamspace.owner.name}/{teamspace.name}/containers/{
|
|
79
|
+
"url": f"{LIGHTNING_CLOUD_URL}/{teamspace.owner.name}/{teamspace.name}/containers/{container_basename}",
|
|
63
80
|
}
|
|
64
81
|
|
|
82
|
+
@retry_on_lcr_auth_failure
|
|
65
83
|
def download_container(self, container: str, teamspace: Teamspace, tag: str) -> Generator[str, None, None]:
|
|
66
84
|
registry_url = _get_registry_url()
|
|
67
85
|
repository = f"{registry_url}/lit-container/{teamspace.owner.name}/{teamspace.name}/{container}"
|
|
68
86
|
try:
|
|
69
87
|
self._docker_client.images.pull(repository, tag=tag)
|
|
88
|
+
except requests.exceptions.HTTPError as e:
|
|
89
|
+
if "unauthorized" in e.response.text:
|
|
90
|
+
raise LCRAuthFailedError() from e
|
|
70
91
|
except docker.errors.APIError as e:
|
|
71
92
|
raise ValueError(f"Could not pull container {container} from {repository}:{tag}") from e
|
|
72
93
|
return self._docker_client.api.tag(repository, container, tag)
|
lightning_sdk/cli/delete.py
CHANGED
|
@@ -4,6 +4,7 @@ from lightning_sdk.cli.exceptions import StudioCliError
|
|
|
4
4
|
from lightning_sdk.cli.job_and_mmt_action import _JobAndMMTAction
|
|
5
5
|
from lightning_sdk.cli.teamspace_menu import _TeamspacesMenu
|
|
6
6
|
from lightning_sdk.lit_container import LitContainer
|
|
7
|
+
from lightning_sdk.studio import Studio
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class _Delete(_JobAndMMTAction, _TeamspacesMenu):
|
|
@@ -56,3 +57,29 @@ class _Delete(_JobAndMMTAction, _TeamspacesMenu):
|
|
|
56
57
|
|
|
57
58
|
mmt.delete()
|
|
58
59
|
print(f"Successfully deleted {mmt.name}!")
|
|
60
|
+
|
|
61
|
+
def studio(self, name: Optional[str] = None, teamspace: Optional[str] = None) -> None:
|
|
62
|
+
"""Delete an existing studio.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
name: The name of the studio to delete.
|
|
66
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
67
|
+
Note: This could delete your current studio if run without arguments.
|
|
68
|
+
teamspace: The teamspace the studio is part of. Should be of format <OWNER>/<TEAMSPACE_NAME>.
|
|
69
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
70
|
+
"""
|
|
71
|
+
if teamspace is not None:
|
|
72
|
+
ts_splits = teamspace.split("/")
|
|
73
|
+
if len(ts_splits) != 2:
|
|
74
|
+
raise ValueError(f"Teamspace should be of format <OWNER>/<TEAMSPACE_NAME> but got {teamspace}")
|
|
75
|
+
owner, teamspace = ts_splits
|
|
76
|
+
else:
|
|
77
|
+
owner, teamspace = None, None
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
studio = Studio(name=name, teamspace=teamspace, org=owner, user=None, create_ok=False)
|
|
81
|
+
except (RuntimeError, ValueError):
|
|
82
|
+
studio = Studio(name=name, teamspace=teamspace, org=None, user=owner, create_ok=False)
|
|
83
|
+
|
|
84
|
+
studio.delete()
|
|
85
|
+
print("Studio successfully deleted")
|
lightning_sdk/cli/entrypoint.py
CHANGED
|
@@ -9,12 +9,15 @@ from lightning_sdk.api.studio_api import _cloud_url
|
|
|
9
9
|
from lightning_sdk.cli.ai_hub import _AIHub
|
|
10
10
|
from lightning_sdk.cli.delete import _Delete
|
|
11
11
|
from lightning_sdk.cli.download import _Downloads
|
|
12
|
+
from lightning_sdk.cli.generate import _Generate
|
|
12
13
|
from lightning_sdk.cli.inspect import _Inspect
|
|
13
14
|
from lightning_sdk.cli.legacy import _LegacyLightningCLI
|
|
14
15
|
from lightning_sdk.cli.list import _List
|
|
15
16
|
from lightning_sdk.cli.run import _Run
|
|
16
17
|
from lightning_sdk.cli.serve import _Docker, _LitServe
|
|
18
|
+
from lightning_sdk.cli.start import _Start
|
|
17
19
|
from lightning_sdk.cli.stop import _Stop
|
|
20
|
+
from lightning_sdk.cli.switch import _Switch
|
|
18
21
|
from lightning_sdk.cli.upload import _Uploads
|
|
19
22
|
from lightning_sdk.lightning_cloud.login import Auth
|
|
20
23
|
|
|
@@ -35,6 +38,9 @@ class StudioCLI:
|
|
|
35
38
|
self.delete = _Delete()
|
|
36
39
|
self.inspect = _Inspect()
|
|
37
40
|
self.stop = _Stop()
|
|
41
|
+
self.start = _Start()
|
|
42
|
+
self.switch = _Switch()
|
|
43
|
+
self.generate = _Generate()
|
|
38
44
|
|
|
39
45
|
sys.excepthook = _notify_exception
|
|
40
46
|
|
|
lightning_sdk/cli/generate.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
|
|
5
|
+
from lightning_sdk import Studio
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _Generate:
|
|
9
|
+
"""Generate configs (such as ssh for studio) and print them to commandline."""
|
|
10
|
+
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
def _generate_ssh_config(self, name: str, studio_id: str) -> str:
|
|
14
|
+
"""Generate SSH config entry for the studio.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
name: Studio name
|
|
18
|
+
studio_id: Studio space ID
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
str: SSH config entry
|
|
22
|
+
"""
|
|
23
|
+
return f"""# ssh s_{studio_id}@ssh.lightning.ai
|
|
24
|
+
|
|
25
|
+
Host {name}
|
|
26
|
+
User s_{studio_id}
|
|
27
|
+
Hostname ssh.lightning.ai
|
|
28
|
+
IdentityFile ~/.ssh/lightning_rsa
|
|
29
|
+
IdentitiesOnly yes
|
|
30
|
+
ServerAliveInterval 15
|
|
31
|
+
ServerAliveCountMax 4
|
|
32
|
+
StrictHostKeyChecking no
|
|
33
|
+
UserKnownHostsFile=/dev/null"""
|
|
34
|
+
|
|
35
|
+
def ssh(self, name: Optional[str] = None, teamspace: Optional[str] = None) -> None:
|
|
36
|
+
"""Get SSH config entry for a studio. Will start the studio if needed.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
name: The name of the studio to stop.
|
|
40
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
41
|
+
teamspace: The teamspace the studio is part of. Should be of format <OWNER>/<TEAMSPACE_NAME>.
|
|
42
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
43
|
+
"""
|
|
44
|
+
if teamspace:
|
|
45
|
+
ts_splits = teamspace.split("/")
|
|
46
|
+
if len(ts_splits) != 2:
|
|
47
|
+
raise ValueError(f"Teamspace should be of format <OWNER>/<TEAMSPACE_NAME> but got {teamspace}")
|
|
48
|
+
owner, teamspace = ts_splits
|
|
49
|
+
else:
|
|
50
|
+
owner, teamspace = None, None
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
studio = Studio(name=name, teamspace=teamspace, org=owner, user=None, create_ok=False)
|
|
54
|
+
except (RuntimeError, ValueError):
|
|
55
|
+
studio = Studio(name=name, teamspace=teamspace, org=None, user=owner, create_ok=False)
|
|
56
|
+
|
|
57
|
+
# Print the SSH config
|
|
58
|
+
self.console.print(self._generate_ssh_config(name, studio._studio.id))
|
lightning_sdk/cli/list.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Optional
|
|
|
3
3
|
from rich.console import Console
|
|
4
4
|
from rich.table import Table
|
|
5
5
|
|
|
6
|
+
from lightning_sdk import Machine
|
|
6
7
|
from lightning_sdk.cli.teamspace_menu import _TeamspacesMenu
|
|
7
8
|
from lightning_sdk.lit_container import LitContainer
|
|
8
9
|
|
|
@@ -10,6 +11,38 @@ from lightning_sdk.lit_container import LitContainer
|
|
|
10
11
|
class _List(_TeamspacesMenu):
|
|
11
12
|
"""List resources on the Lightning AI platform."""
|
|
12
13
|
|
|
14
|
+
def studios(self, teamspace: Optional[str] = None) -> None:
|
|
15
|
+
"""List studios for a given teamspace.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
teamspace: the teamspace to list studios from. Should be specified as {owner}/{name}
|
|
19
|
+
If not provided, can be selected in an interactive menu.
|
|
20
|
+
|
|
21
|
+
"""
|
|
22
|
+
resolved_teamspace = self._resolve_teamspace(teamspace=teamspace)
|
|
23
|
+
|
|
24
|
+
studios = resolved_teamspace.studios
|
|
25
|
+
|
|
26
|
+
table = Table(
|
|
27
|
+
pad_edge=True,
|
|
28
|
+
)
|
|
29
|
+
table.add_column("Name")
|
|
30
|
+
table.add_column("Teamspace")
|
|
31
|
+
table.add_column("Status")
|
|
32
|
+
table.add_column("Machine")
|
|
33
|
+
table.add_column("Cloud account")
|
|
34
|
+
for studio in studios:
|
|
35
|
+
table.add_row(
|
|
36
|
+
studio.name,
|
|
37
|
+
f"{studio.teamspace.owner.name}/{studio.teamspace.name}",
|
|
38
|
+
str(studio.status),
|
|
39
|
+
str(studio.machine) if studio.machine is not None else None,
|
|
40
|
+
str(studio.cloud_account),
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
console = Console()
|
|
44
|
+
console.print(table)
|
|
45
|
+
|
|
13
46
|
def jobs(self, teamspace: Optional[str] = None) -> None:
|
|
14
47
|
"""List jobs for a given teamspace.
|
|
15
48
|
|
|
@@ -110,3 +143,18 @@ class _List(_TeamspacesMenu):
|
|
|
110
143
|
table.add_row(repo["REPOSITORY"], repo["IMAGE ID"], repo["CREATED"])
|
|
111
144
|
console = Console()
|
|
112
145
|
console.print(table)
|
|
146
|
+
|
|
147
|
+
def machines(self) -> None:
|
|
148
|
+
"""Display the list of available machines."""
|
|
149
|
+
table = Table(pad_edge=True)
|
|
150
|
+
table.add_column("Name")
|
|
151
|
+
|
|
152
|
+
# Get all machine types from the enum
|
|
153
|
+
machine_types = [name for name in dir(Machine) if not name.startswith("_")]
|
|
154
|
+
|
|
155
|
+
# Add rows to table
|
|
156
|
+
for name in sorted(machine_types):
|
|
157
|
+
table.add_row(name)
|
|
158
|
+
|
|
159
|
+
console = Console()
|
|
160
|
+
console.print(table)
|
|
lightning_sdk/cli/start.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from lightning_sdk import Machine, Studio
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class _Start:
|
|
7
|
+
"""Start resources on the Lightning AI platform."""
|
|
8
|
+
|
|
9
|
+
def __init__(self) -> None:
|
|
10
|
+
_machine_values = tuple([machine.value for machine in Machine])
|
|
11
|
+
|
|
12
|
+
docstr_studio = f"""Start a studio on a given machine.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
name: The name of the studio to start.
|
|
16
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
17
|
+
teamspace: The teamspace the studio is part of. Should be of format <OWNER>/<TEAMSPACE_NAME>.
|
|
18
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
19
|
+
machine: The machine type to start the studio on. One of {", ".join(_machine_values)}.
|
|
20
|
+
Defaults to the CPU Machine.
|
|
21
|
+
"""
|
|
22
|
+
self.studio.__func__.__doc__ = docstr_studio
|
|
23
|
+
|
|
24
|
+
def studio(self, name: Optional[str] = None, teamspace: Optional[str] = None, machine: str = "CPU") -> None:
|
|
25
|
+
if teamspace is not None:
|
|
26
|
+
ts_splits = teamspace.split("/")
|
|
27
|
+
if len(ts_splits) != 2:
|
|
28
|
+
raise ValueError(f"Teamspace should be of format <OWNER>/<TEAMSPACE_NAME> but got {teamspace}")
|
|
29
|
+
owner, teamspace = ts_splits
|
|
30
|
+
else:
|
|
31
|
+
owner, teamspace = None, None
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
studio = Studio(name=name, teamspace=teamspace, org=owner, user=None, create_ok=False)
|
|
35
|
+
except (RuntimeError, ValueError):
|
|
36
|
+
studio = Studio(name=name, teamspace=teamspace, org=None, user=owner, create_ok=False)
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
resolved_machine = Machine[machine.upper()]
|
|
40
|
+
except KeyError:
|
|
41
|
+
resolved_machine = machine
|
|
42
|
+
|
|
43
|
+
studio.start(resolved_machine)
|
lightning_sdk/cli/stop.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
from lightning_sdk.cli.job_and_mmt_action import _JobAndMMTAction
|
|
4
|
+
from lightning_sdk.studio import Studio
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class _Stop(_JobAndMMTAction):
|
|
@@ -35,3 +36,28 @@ class _Stop(_JobAndMMTAction):
|
|
|
35
36
|
|
|
36
37
|
mmt.stop()
|
|
37
38
|
print(f"Successfully stopped {mmt.name}!")
|
|
39
|
+
|
|
40
|
+
def studio(self, name: Optional[str] = None, teamspace: Optional[str] = None) -> None:
|
|
41
|
+
"""Stop a running studio.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
name: The name of the studio to stop.
|
|
45
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
46
|
+
teamspace: The teamspace the studio is part of. Should be of format <OWNER>/<TEAMSPACE_NAME>.
|
|
47
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
48
|
+
"""
|
|
49
|
+
if teamspace is not None:
|
|
50
|
+
ts_splits = teamspace.split("/")
|
|
51
|
+
if len(ts_splits) != 2:
|
|
52
|
+
raise ValueError(f"Teamspace should be of format <OWNER>/<TEAMSPACE_NAME> but got {teamspace}")
|
|
53
|
+
owner, teamspace = ts_splits
|
|
54
|
+
else:
|
|
55
|
+
owner, teamspace = None, None
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
studio = Studio(name=name, teamspace=teamspace, org=owner, user=None, create_ok=False)
|
|
59
|
+
except (RuntimeError, ValueError):
|
|
60
|
+
studio = Studio(name=name, teamspace=teamspace, org=None, user=owner, create_ok=False)
|
|
61
|
+
|
|
62
|
+
studio.stop()
|
|
63
|
+
print("Studio successfully stopped")
|
|
lightning_sdk/cli/switch.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from lightning_sdk import Machine, Studio
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class _Switch:
|
|
7
|
+
"""Switch machines for resources on the Lightning AI platform."""
|
|
8
|
+
|
|
9
|
+
def __init__(self) -> None:
|
|
10
|
+
_machine_values = tuple([machine.value for machine in Machine])
|
|
11
|
+
|
|
12
|
+
docstr_studio = f"""Switch a studio to a given machine.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
name: The name of the studio to start.
|
|
16
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
17
|
+
teamspace: The teamspace the studio is part of. Should be of format <OWNER>/<TEAMSPACE_NAME>.
|
|
18
|
+
If not specified, tries to infer from the environment (e.g. when run from within a Studio.)
|
|
19
|
+
machine: The machine type to switch to. One of {", ".join(_machine_values)}.
|
|
20
|
+
Defaults to the CPU Machine.
|
|
21
|
+
"""
|
|
22
|
+
self.studio.__func__.__doc__ = docstr_studio
|
|
23
|
+
|
|
24
|
+
def studio(self, name: Optional[str] = None, teamspace: Optional[str] = None, machine: str = "CPU") -> None:
|
|
25
|
+
if teamspace is not None:
|
|
26
|
+
ts_splits = teamspace.split("/")
|
|
27
|
+
if len(ts_splits) != 2:
|
|
28
|
+
raise ValueError(f"Teamspace should be of format <OWNER>/<TEAMSPACE_NAME> but got {teamspace}")
|
|
29
|
+
owner, teamspace = ts_splits
|
|
30
|
+
else:
|
|
31
|
+
owner, teamspace = None, None
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
studio = Studio(name=name, teamspace=teamspace, org=owner, user=None, create_ok=False)
|
|
35
|
+
except (RuntimeError, ValueError):
|
|
36
|
+
studio = Studio(name=name, teamspace=teamspace, org=None, user=owner, create_ok=False)
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
resolved_machine = Machine[machine.upper()]
|
|
40
|
+
except KeyError:
|
|
41
|
+
resolved_machine = machine
|
|
42
|
+
|
|
43
|
+
studio.switch_machine(resolved_machine)
|
|
lightning_sdk/deployment/deployment.py
CHANGED
|
@@ -42,7 +42,7 @@ class Deployment:
|
|
|
42
42
|
and switching machine types, etc..
|
|
43
43
|
|
|
44
44
|
Args:
|
|
45
|
-
name: The name of the deployment.
|
|
45
|
+
name: The name or the id of the deployment.
|
|
46
46
|
teamspace: The teamspace in which you want to deploy.
|
|
47
47
|
org: The name of the organization owning the :param`teamspace` in case it is owned by an org
|
|
48
48
|
user: The name of the user owning the :param`teamspace` in case it is owned directly by a user instead of an org
|
|
@@ -55,7 +55,7 @@ class Deployment:
|
|
|
55
55
|
|
|
56
56
|
def __init__(
|
|
57
57
|
self,
|
|
58
|
-
name: str,
|
|
58
|
+
name: str,
|
|
59
59
|
teamspace: Optional[Union[str, Teamspace]] = None,
|
|
60
60
|
org: Optional[Union[str, Organization]] = None,
|
|
61
61
|
user: Optional[Union[str, User]] = None,
|
|
@@ -83,8 +83,14 @@ class Deployment:
|
|
|
83
83
|
self._deployment_api = DeploymentApi()
|
|
84
84
|
self._cloud_account = _get_cluster(client=self._deployment_api._client, project_id=self._teamspace.id)
|
|
85
85
|
self._is_created = False
|
|
86
|
-
|
|
86
|
+
|
|
87
|
+
if name.startswith("dep_"):
|
|
88
|
+
deployment = self._deployment_api.get_deployment_by_id(name, self._teamspace.id)
|
|
89
|
+
else:
|
|
90
|
+
deployment = self._deployment_api.get_deployment_by_name(name, self._teamspace.id)
|
|
91
|
+
|
|
87
92
|
if deployment:
|
|
93
|
+
self._name = deployment.name
|
|
88
94
|
self._is_created = True
|
|
89
95
|
self._deployment = deployment
|
|
90
96
|
|
|
@@ -163,6 +169,9 @@ class Deployment:
|
|
|
163
169
|
strategy=to_strategy(release_strategy),
|
|
164
170
|
)
|
|
165
171
|
)
|
|
172
|
+
|
|
173
|
+
# Overrides the name
|
|
174
|
+
self._name = self._deployment._name
|
|
166
175
|
self._is_created = True
|
|
167
176
|
|
|
168
177
|
def update(
|
lightning_sdk/job/base.py
CHANGED
|
@@ -275,20 +275,25 @@ class _BaseJob(ABC):
|
|
|
275
275
|
Caution: This also deletes all artifacts and snapshots associated with the job.
|
|
276
276
|
"""
|
|
277
277
|
|
|
278
|
-
def wait(self, interval: float = 5.0) -> None:
|
|
278
|
+
def wait(self, interval: float = 5.0, timeout: Optional[float] = None) -> None:
|
|
279
279
|
"""Waits for the job to be either completed, manually stopped or failed.
|
|
280
280
|
|
|
281
281
|
Args:
|
|
282
|
-
interval:
|
|
282
|
+
interval: The number of seconds to spend in-between status checks.
|
|
283
|
+
timeout: The maximum number of seconds to wait before raising an error. If None, waits forever.
|
|
283
284
|
"""
|
|
284
285
|
import time
|
|
285
286
|
|
|
286
287
|
from lightning_sdk.status import Status
|
|
287
288
|
|
|
289
|
+
start = time.time()
|
|
288
290
|
while True:
|
|
289
291
|
if self.status in (Status.Completed, Status.Stopped, Status.Failed):
|
|
290
292
|
break
|
|
291
293
|
|
|
294
|
+
if timeout is not None and time.time() - start > timeout:
|
|
295
|
+
raise TimeoutError("Job didn't finish within the provided timeout.")
|
|
296
|
+
|
|
292
297
|
time.sleep(interval)
|
|
293
298
|
|
|
294
299
|
@property
|
lightning_sdk/job/job.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
from functools import lru_cache
|
|
2
1
|
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
|
3
2
|
|
|
4
|
-
from lightning_sdk.api.user_api import UserApi
|
|
5
3
|
from lightning_sdk.job.base import _BaseJob
|
|
6
4
|
from lightning_sdk.job.v1 import _JobV1
|
|
7
5
|
from lightning_sdk.job.v2 import _JobV2
|
|
@@ -18,15 +16,6 @@ if TYPE_CHECKING:
|
|
|
18
16
|
from lightning_sdk.user import User
|
|
19
17
|
|
|
20
18
|
|
|
21
|
-
@lru_cache(maxsize=None)
|
|
22
|
-
def _has_jobs_v2() -> bool:
|
|
23
|
-
api = UserApi()
|
|
24
|
-
try:
|
|
25
|
-
return api._get_feature_flags().jobs_v2
|
|
26
|
-
except Exception:
|
|
27
|
-
return False
|
|
28
|
-
|
|
29
|
-
|
|
30
19
|
class Job(_BaseJob):
|
|
31
20
|
"""Class to submit and manage single-machine jobs on the Lightning AI Platform."""
|
|
32
21
|
|
|
@@ -52,7 +41,7 @@ class Job(_BaseJob):
|
|
|
52
41
|
"""
|
|
53
42
|
from lightning_sdk.lightning_cloud.openapi.rest import ApiException
|
|
54
43
|
|
|
55
|
-
if
|
|
44
|
+
if not self._force_v1:
|
|
56
45
|
# try with v2 and fall back to v1
|
|
57
46
|
try:
|
|
58
47
|
job = _JobV2(
|
lightning_sdk/job/v1.py
CHANGED
|
@@ -126,38 +126,7 @@ class _JobV1(_BaseJob):
|
|
|
126
126
|
The submitted job.
|
|
127
127
|
|
|
128
128
|
"""
|
|
129
|
-
|
|
130
|
-
raise ValueError("Studio is required for submitting jobs")
|
|
131
|
-
if image is not None or image_credentials is not None or cloud_account_auth:
|
|
132
|
-
raise ValueError("Image is not supported for submitting jobs")
|
|
133
|
-
|
|
134
|
-
if artifacts_local is not None or artifacts_remote is not None:
|
|
135
|
-
raise ValueError("Specifying how to persist artifacts is not yet supported with jobs")
|
|
136
|
-
|
|
137
|
-
if env is not None:
|
|
138
|
-
raise ValueError("Environment variables are not supported for submitting jobs")
|
|
139
|
-
if command is None:
|
|
140
|
-
raise ValueError("Command is required for submitting jobs")
|
|
141
|
-
|
|
142
|
-
if entrypoint != "sh -c":
|
|
143
|
-
raise ValueError("Specifying the entrypoint is not yet supported with jobs")
|
|
144
|
-
|
|
145
|
-
if path_mappings is not None:
|
|
146
|
-
raise ValueError("Specifying path mappings is not yet supported with jobs")
|
|
147
|
-
|
|
148
|
-
# TODO: add support for empty names (will give an empty string)
|
|
149
|
-
_submitted = self._job_api.submit_job(
|
|
150
|
-
name=self._name,
|
|
151
|
-
command=command,
|
|
152
|
-
studio_id=studio._studio.id,
|
|
153
|
-
teamspace_id=self._teamspace.id,
|
|
154
|
-
cloud_account=cloud_account or "",
|
|
155
|
-
machine=machine,
|
|
156
|
-
interruptible=interruptible,
|
|
157
|
-
)
|
|
158
|
-
self._name = _submitted.name
|
|
159
|
-
self._job = _submitted
|
|
160
|
-
return self
|
|
129
|
+
raise NotImplementedError("Cannot submit new jobs with JobsV1!")
|
|
161
130
|
|
|
162
131
|
def _update_internal_job(self) -> None:
|
|
163
132
|
try:
|
lightning_sdk/job/v2.py
CHANGED
|
@@ -140,7 +140,12 @@ class _JobV2(_BaseJob):
|
|
|
140
140
|
@property
|
|
141
141
|
def status(self) -> "Status":
|
|
142
142
|
"""The current status of the job."""
|
|
143
|
-
|
|
143
|
+
try:
|
|
144
|
+
return self._job_api._job_state_to_external(self._latest_job.state)
|
|
145
|
+
except Exception:
|
|
146
|
+
raise RuntimeError(
|
|
147
|
+
f"Job {self._name} does not exist in Teamspace {self.teamspace.name}. Did you delete it?"
|
|
148
|
+
) from None
|
|
144
149
|
|
|
145
150
|
@property
|
|
146
151
|
def machine(self) -> Union["Machine", str]:
|
|
lightning_sdk/lightning_cloud/openapi/__init__.py
CHANGED
|
@@ -571,6 +571,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_list_lightningapp_instances
|
|
|
571
571
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_lightningwork_events_response import V1ListLightningworkEventsResponse
|
|
572
572
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_lightningwork_response import V1ListLightningworkResponse
|
|
573
573
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_lit_pages_response import V1ListLitPagesResponse
|
|
574
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_list_lit_registry_repository_image_artifact_versions_response import V1ListLitRegistryRepositoryImageArtifactVersionsResponse
|
|
574
575
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_logger_artifact_response import V1ListLoggerArtifactResponse
|
|
575
576
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_managed_endpoints_response import V1ListManagedEndpointsResponse
|
|
576
577
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_memberships_response import V1ListMembershipsResponse
|
|
@@ -606,6 +607,7 @@ from lightning_sdk.lightning_cloud.openapi.models.v1_list_studio_jobs_response i
|
|
|
606
607
|
from lightning_sdk.lightning_cloud.openapi.models.v1_list_user_slurm_jobs_response import V1ListUserSLURMJobsResponse
|
|
607
608
|
from lightning_sdk.lightning_cloud.openapi.models.v1_lit_page import V1LitPage
|
|
608
609
|
from lightning_sdk.lightning_cloud.openapi.models.v1_lit_page_type import V1LitPageType
|
|
610
|
+
from lightning_sdk.lightning_cloud.openapi.models.v1_lit_registry_artifact import V1LitRegistryArtifact
|
|
609
611
|
from lightning_sdk.lightning_cloud.openapi.models.v1_lit_registry_project import V1LitRegistryProject
|
|
610
612
|
from lightning_sdk.lightning_cloud.openapi.models.v1_lit_repository import V1LitRepository
|
|
611
613
|
from lightning_sdk.lightning_cloud.openapi.models.v1_locked_resource import V1LockedResource
|